1 From ffdb5092c571c88a6d924c2c3dbf8bec9d70a3e6 Mon Sep 17 00:00:00 2001
2 From: Phil Elwell <phil@raspberrypi.org>
3 Date: Mon, 13 Oct 2014 11:47:53 +0100
4 Subject: [PATCH] Improve __copy_to_user and __copy_from_user performance
6 Provide a __copy_from_user that uses memcpy. On BCM2708, use
7 optimised memcpy/memmove/memcmp/memset implementations.
9 arch/arm: Add mmiocpy/set aliases for memcpy/set
11 See: https://github.com/raspberrypi/linux/issues/1082
13 arch/arm/include/asm/string.h | 5 +
14 arch/arm/include/asm/uaccess.h | 3 +
15 arch/arm/lib/Makefile | 15 +-
16 arch/arm/lib/arm-mem.h | 159 ++++++++++++
17 arch/arm/lib/copy_from_user.S | 4 +-
18 arch/arm/lib/exports_rpi.c | 37 +++
19 arch/arm/lib/memcmp_rpi.S | 285 +++++++++++++++++++++
20 arch/arm/lib/memcpy_rpi.S | 61 +++++
21 arch/arm/lib/memcpymove.h | 506 +++++++++++++++++++++++++++++++++++++
22 arch/arm/lib/memmove_rpi.S | 61 +++++
23 arch/arm/lib/memset_rpi.S | 123 +++++++++
24 arch/arm/lib/uaccess_with_memcpy.c | 112 +++++++-
25 12 files changed, 1365 insertions(+), 6 deletions(-)
26 create mode 100644 arch/arm/lib/arm-mem.h
27 create mode 100644 arch/arm/lib/exports_rpi.c
28 create mode 100644 arch/arm/lib/memcmp_rpi.S
29 create mode 100644 arch/arm/lib/memcpy_rpi.S
30 create mode 100644 arch/arm/lib/memcpymove.h
31 create mode 100644 arch/arm/lib/memmove_rpi.S
32 create mode 100644 arch/arm/lib/memset_rpi.S
34 --- a/arch/arm/include/asm/string.h
35 +++ b/arch/arm/include/asm/string.h
36 @@ -24,6 +24,11 @@ extern void * memchr(const void *, int,
37 #define __HAVE_ARCH_MEMSET
38 extern void * memset(void *, int, __kernel_size_t);
40 +#ifdef CONFIG_MACH_BCM2708
41 +#define __HAVE_ARCH_MEMCMP
42 +extern int memcmp(const void *, const void *, size_t);
45 extern void __memzero(void *ptr, __kernel_size_t n);
47 #define memset(p,v,n) \
48 --- a/arch/arm/include/asm/uaccess.h
49 +++ b/arch/arm/include/asm/uaccess.h
50 @@ -493,6 +493,9 @@ do { \
51 extern unsigned long __must_check
52 arm_copy_from_user(void *to, const void __user *from, unsigned long n);
54 +extern unsigned long __must_check
55 +__copy_from_user_std(void *to, const void __user *from, unsigned long n);
57 static inline unsigned long __must_check
58 __copy_from_user(void *to, const void __user *from, unsigned long n)
60 --- a/arch/arm/lib/Makefile
61 +++ b/arch/arm/lib/Makefile
64 lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
65 csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
66 - delay.o delay-loop.o findbit.o memchr.o memcpy.o \
67 - memmove.o memset.o memzero.o setbit.o \
68 - strchr.o strrchr.o \
69 + delay.o delay-loop.o findbit.o memchr.o memzero.o \
70 + setbit.o strchr.o strrchr.o \
71 testchangebit.o testclearbit.o testsetbit.o \
72 ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \
73 ucmpdi2.o lib1funcs.o div64.o \
74 @@ -18,6 +17,16 @@ lib-y := backtrace.o changebit.o csumip
75 mmu-y := clear_user.o copy_page.o getuser.o putuser.o \
76 copy_from_user.o copy_to_user.o
78 +# Choose optimised implementations for Raspberry Pi
79 +ifeq ($(CONFIG_MACH_BCM2708),y)
80 + CFLAGS_uaccess_with_memcpy.o += -DCOPY_FROM_USER_THRESHOLD=1600
81 + CFLAGS_uaccess_with_memcpy.o += -DCOPY_TO_USER_THRESHOLD=672
82 + obj-$(CONFIG_MODULES) += exports_rpi.o
83 + lib-y += memcpy_rpi.o memmove_rpi.o memset_rpi.o memcmp_rpi.o
85 + lib-y += memcpy.o memmove.o memset.o
88 # using lib_ here won't override already available weak symbols
89 obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o
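These two -D values override the 64-byte defaults that this patch adds to
uaccess_with_memcpy.c, and they gate a dispatch of the following shape (a
sketch only, mirroring the arm_copy_from_user added near the end of the patch;
the numbers are the BCM2708 tuning above):

    unsigned long arm_copy_from_user(void *to, const void __user *from, unsigned long n)
    {
            if (n < COPY_FROM_USER_THRESHOLD)                 /* 1600 here, 64 by default */
                    return __copy_from_user_std(to, from, n); /* original assembler routine */
            return __copy_from_user_memcpy(to, from, n);      /* pin pages, then memcpy */
    }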
92 +++ b/arch/arm/lib/arm-mem.h
95 +Copyright (c) 2013, Raspberry Pi Foundation
96 +Copyright (c) 2013, RISC OS Open Ltd
99 +Redistribution and use in source and binary forms, with or without
100 +modification, are permitted provided that the following conditions are met:
101 + * Redistributions of source code must retain the above copyright
102 + notice, this list of conditions and the following disclaimer.
103 + * Redistributions in binary form must reproduce the above copyright
104 + notice, this list of conditions and the following disclaimer in the
105 + documentation and/or other materials provided with the distribution.
106 + * Neither the name of the copyright holder nor the
107 + names of its contributors may be used to endorse or promote products
108 + derived from this software without specific prior written permission.
110 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
111 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
112 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
113 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
114 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
115 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
116 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
117 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
118 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
119 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
128 +.macro preload_leading_step1 backwards, ptr, base
129 +/* If the destination is already 16-byte aligned, then we need to preload
130 + * between 0 and prefetch_distance (inclusive) cache lines ahead so there
131 + * are no gaps when the inner loop starts.
140 + .rept prefetch_distance+1
143 + .set OFFSET, OFFSET-32
145 + .set OFFSET, OFFSET+32
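A rough C model of this macro (not part of the patch; it assumes the 32-byte
cache lines that the pld offsets encode):

    static inline void preload_leading_step1(int backwards, const char *base,
                                             int prefetch_distance)
    {
            unsigned long line = (unsigned long)base & ~31UL;  /* base's cache line */
            int i;

            for (i = 0; i <= prefetch_distance; i++) {         /* prefetch_distance+1 plds */
                    __builtin_prefetch((const void *)line);
                    line = backwards ? line - 32 : line + 32;  /* walk away from base */
            }
    }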
150 +.macro preload_leading_step2 backwards, ptr, base, leading_bytes, tmp
151 +/* However, if the destination is not 16-byte aligned, we may need to
152 + * preload one more cache line than that. The question we need to ask is:
153 + * are the leading bytes more than the amount by which the source
154 + * pointer will be rounded down for preloading, and if so, by how many
158 +/* Here we compare against how many bytes we are into the
159 + * cache line, counting down from the highest such address.
160 + * Effectively, we want to calculate
161 + * leading_bytes = dst&15
162 + * cacheline_offset = 31-((src-leading_bytes-1)&31)
163 + * extra_needed = leading_bytes - cacheline_offset
164 + * and test if extra_needed is <= 0, or rearranging:
165 + * leading_bytes + (src-leading_bytes-1)&31 <= 31
167 + mov tmp, base, lsl #32-5
168 + sbc tmp, tmp, leading_bytes, lsl #32-5
169 + adds tmp, tmp, leading_bytes, lsl #32-5
171 + pld [ptr, #-32*(prefetch_distance+1)]
173 +/* Effectively, we want to calculate
174 + * leading_bytes = (-dst)&15
175 + * cacheline_offset = (src+leading_bytes)&31
176 + * extra_needed = leading_bytes - cacheline_offset
177 + * and test if extra_needed is <= 0.
179 + mov tmp, base, lsl #32-5
180 + add tmp, tmp, leading_bytes, lsl #32-5
181 + rsbs tmp, tmp, leading_bytes, lsl #32-5
183 + pld [ptr, #32*(prefetch_distance+1)]
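The commented formulas translate into C roughly as follows (illustrative only;
32-byte cache lines, 16-byte store alignment as above):

    static inline void preload_leading_step2(int backwards, const char *ptr,
                                             const char *base, unsigned leading_bytes,
                                             int prefetch_distance)
    {
            unsigned cacheline_offset;

            if (backwards)
                    cacheline_offset = 31 - (((unsigned long)base - leading_bytes - 1) & 31);
            else
                    cacheline_offset = ((unsigned long)base + leading_bytes) & 31;

            if (leading_bytes > cacheline_offset)   /* extra_needed > 0 */
                    __builtin_prefetch(ptr + (backwards ? -32 : 32) * (prefetch_distance + 1));
    }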
188 +.macro preload_trailing backwards, base, remain, tmp
189 + /* We need either 0, 1 or 2 extra preloads */
192 + mov tmp, tmp, lsl #32-5
194 + mov tmp, base, lsl #32-5
196 + adds tmp, tmp, remain, lsl #32-5
197 + adceqs tmp, tmp, #0
198 + /* The instruction above has two effects: it ensures Z is only
199 + * set if C was clear (so Z indicates that both shifted quantities
200 + * were 0), and it clears C if Z was set (so C indicates that the sum
201 + * of the shifted quantities was strictly greater than 32) */
211 + pld [tmp, #-32*(prefetch_distance+1)]
213 + pld [tmp, #-32*prefetch_distance]
215 + pld [tmp, #32*(prefetch_distance+2)]
217 + pld [tmp, #32*(prefetch_distance+1)]
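One way to read the flag tests above in plain C (a sketch of the decision only,
not of the exact instruction sequence):

    /* how many trailing cache lines still need a pld: 0, 1 or 2 */
    static inline unsigned trailing_preloads(int backwards, unsigned long base,
                                             unsigned long remain)
    {
            unsigned off = (backwards ? -base : base) & 31;  /* offset into a 32-byte line */
            unsigned sum = off + (remain & 31);

            if (sum == 0)
                    return 0;       /* Z set: both shifted quantities were zero */
            if (sum > 32)
                    return 2;       /* C set: the tail spills into an extra line */
            return 1;
    }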
222 +.macro preload_all backwards, narrow_case, shift, base, remain, tmp0, tmp1
225 + bic tmp0, tmp0, #31
227 + sub tmp1, base, remain, lsl #shift
229 + bic tmp0, base, #31
231 + add tmp1, base, remain, lsl #shift
234 + bic tmp1, tmp1, #31
238 + /* In this case, all the data fits in either 1 or 2 cache lines */
243 + sub tmp0, tmp0, #32
245 + add tmp0, tmp0, #32
253 --- a/arch/arm/lib/copy_from_user.S
254 +++ b/arch/arm/lib/copy_from_user.S
259 -ENTRY(arm_copy_from_user)
260 +ENTRY(__copy_from_user_std)
261 +WEAK(arm_copy_from_user)
263 #include "copy_template.S"
265 ENDPROC(arm_copy_from_user)
266 +ENDPROC(__copy_from_user_std)
268 .pushsection .fixup,"ax"
271 +++ b/arch/arm/lib/exports_rpi.c
274 + * Copyright (c) 2014, Raspberry Pi (Trading) Ltd.
276 + * Redistribution and use in source and binary forms, with or without
277 + * modification, are permitted provided that the following conditions
279 + * 1. Redistributions of source code must retain the above copyright
280 + * notice, this list of conditions, and the following disclaimer,
281 + * without modification.
282 + * 2. Redistributions in binary form must reproduce the above copyright
283 + * notice, this list of conditions and the following disclaimer in the
284 + * documentation and/or other materials provided with the distribution.
285 + * 3. The names of the above-listed copyright holders may not be used
286 + * to endorse or promote products derived from this software without
287 + * specific prior written permission.
289 + * ALTERNATIVELY, this software may be distributed under the terms of the
290 + * GNU General Public License ("GPL") version 2, as published by the Free
291 + * Software Foundation.
293 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
294 + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
295 + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
296 + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
297 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
298 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
299 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
300 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
301 + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
302 + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
303 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
306 +#include <linux/kernel.h>
307 +#include <linux/module.h>
309 +EXPORT_SYMBOL(memcmp);
311 +++ b/arch/arm/lib/memcmp_rpi.S
314 +Copyright (c) 2013, Raspberry Pi Foundation
315 +Copyright (c) 2013, RISC OS Open Ltd
316 +All rights reserved.
318 +Redistribution and use in source and binary forms, with or without
319 +modification, are permitted provided that the following conditions are met:
320 + * Redistributions of source code must retain the above copyright
321 + notice, this list of conditions and the following disclaimer.
322 + * Redistributions in binary form must reproduce the above copyright
323 + notice, this list of conditions and the following disclaimer in the
324 + documentation and/or other materials provided with the distribution.
325 + * Neither the name of the copyright holder nor the
326 + names of its contributors may be used to endorse or promote products
327 + derived from this software without specific prior written permission.
329 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
330 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
331 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
332 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
333 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
334 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
335 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
336 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
337 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
338 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
341 +#include <linux/linkage.h>
342 +#include "arm-mem.h"
344 +/* Prevent the stack from becoming executable */
345 +#if defined(__linux__) && defined(__ELF__)
346 +.section .note.GNU-stack,"",%progbits
356 +.macro memcmp_process_head unaligned
358 + ldr DAT0, [S_1], #4
359 + ldr DAT1, [S_1], #4
360 + ldr DAT2, [S_1], #4
361 + ldr DAT3, [S_1], #4
363 + ldmia S_1!, {DAT0, DAT1, DAT2, DAT3}
365 + ldmia S_2!, {DAT4, DAT5, DAT6, DAT7}
368 +.macro memcmp_process_tail
376 +.macro memcmp_leading_31bytes
377 + movs DAT0, OFF, lsl #31
378 + ldrmib DAT0, [S_1], #1
379 + ldrcsh DAT1, [S_1], #2
380 + ldrmib DAT4, [S_2], #1
381 + ldrcsh DAT5, [S_2], #2
391 + movs DAT0, OFF, lsl #29
392 + ldrmi DAT0, [S_1], #4
393 + ldrcs DAT1, [S_1], #4
394 + ldrcs DAT2, [S_1], #4
395 + ldrmi DAT4, [S_2], #4
396 + ldmcsia S_2!, {DAT5, DAT6}
411 + memcmp_process_head 1
413 + memcmp_process_tail
417 +.macro memcmp_trailing_15bytes unaligned
420 + ldrcs DAT0, [S_1], #4
421 + ldrcs DAT1, [S_1], #4
423 + ldmcsia S_1!, {DAT0, DAT1}
425 + ldrmi DAT2, [S_1], #4
426 + ldmcsia S_2!, {DAT4, DAT5}
427 + ldrmi DAT6, [S_2], #4
439 + ldrcsh DAT0, [S_1], #2
441 + ldrcsh DAT4, [S_2], #2
452 +.macro memcmp_long_inner_loop unaligned
454 + memcmp_process_head unaligned
455 + pld [S_2, #prefetch_distance*32 + 16]
456 + memcmp_process_tail
457 + memcmp_process_head unaligned
459 + memcmp_process_tail
462 + /* Just before the final (prefetch_distance+1) 32-byte blocks,
463 + * deal with final preloads */
464 + preload_trailing 0, S_1, N, DAT0
465 + preload_trailing 0, S_2, N, DAT0
466 + add N, N, #(prefetch_distance+2)*32 - 16
468 + memcmp_process_head unaligned
469 + memcmp_process_tail
472 + /* Trailing words and bytes */
475 + memcmp_trailing_15bytes unaligned
476 +199: /* Reached end without detecting a difference */
479 + pop {DAT1-DAT6, pc}
482 +.macro memcmp_short_inner_loop unaligned
483 + subs N, N, #16 /* simplifies inner loop termination */
486 + memcmp_process_head unaligned
487 + memcmp_process_tail
490 +122: /* Trailing words and bytes */
493 + memcmp_trailing_15bytes unaligned
494 +199: /* Reached end without detecting a difference */
497 + pop {DAT1-DAT6, pc}
501 + * int memcmp(const void *s1, const void *s2, size_t n);
503 + * a1 = pointer to buffer 1
504 + * a2 = pointer to buffer 2
505 + * a3 = number of bytes to compare (as unsigned chars)
507 + * a1 = >0/=0/<0 if s1 >/=/< s2
510 +.set prefetch_distance, 2
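The interface is standard memcmp: only the sign of the result matters, and the
bytes compare as unsigned chars. A trivial usage example:

    #include <string.h>

    int example(void)
    {
            const unsigned char a[1] = { 0x80 };
            const unsigned char b[1] = { 0x7f };

            /* bytes compare as unsigned chars, so 0x80 > 0x7f and the result
             * is positive even though (signed char)0x80 would be negative */
            return memcmp(a, b, 1);         /* > 0 */
    }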
526 + push {DAT1-DAT6, lr}
527 + setend be /* lowest-addressed bytes are most significant */
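Why the big-endian load mode helps, as a C model (the routine itself stays in
assembler; __builtin_bswap32 stands in for loading with 'setend be' in effect
on a little-endian core): once the lowest-addressed byte is the most
significant, one unsigned word comparison orders 4-byte chunks exactly as a
byte-by-byte memcmp would.

    static int word_cmp(unsigned int w1, unsigned int w2)
    {
            unsigned int b1 = __builtin_bswap32(w1);        /* first byte -> MSB */
            unsigned int b2 = __builtin_bswap32(w2);

            return (b1 > b2) - (b1 < b2);   /* sign matches byte-wise memcmp */
    }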
529 + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
530 + cmp N, #(prefetch_distance+3)*32 - 1
534 + /* Adjust N so that the decrement instruction can also test for
535 + * inner loop termination. We want it to stop when there are
536 + * (prefetch_distance+1) complete blocks to go. */
537 + sub N, N, #(prefetch_distance+2)*32
538 + preload_leading_step1 0, DAT0, S_1
539 + preload_leading_step1 0, DAT1, S_2
542 + rsb OFF, S_2, #0 /* no need to AND with 15 here */
543 + preload_leading_step2 0, DAT0, S_1, OFF, DAT2
544 + preload_leading_step2 0, DAT1, S_2, OFF, DAT2
545 + memcmp_leading_31bytes
546 +154: /* Second source now cacheline (32-byte) aligned; we have at
547 + * least one prefetch to go. */
548 + /* Prefetch offset is best selected such that it lies in the
549 + * first 8 of each 32 bytes - but it's just as easy to aim for
552 + rsb OFF, OFF, #32*prefetch_distance
555 + memcmp_long_inner_loop 0
556 +140: memcmp_long_inner_loop 1
558 +170: /* Short case */
561 + preload_all 0, 0, 0, S_1, N, DAT0, DAT1
562 + preload_all 0, 0, 0, S_2, N, DAT0, DAT1
567 + ldrb DAT0, [S_1], #1
568 + ldrb DAT4, [S_2], #1
573 +174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */
576 + memcmp_short_inner_loop 0
577 +140: memcmp_short_inner_loop 1
579 +200: /* Difference found: determine sign. */
583 + pop {DAT1-DAT6, pc}
599 +++ b/arch/arm/lib/memcpy_rpi.S
602 +Copyright (c) 2013, Raspberry Pi Foundation
603 +Copyright (c) 2013, RISC OS Open Ltd
604 +All rights reserved.
606 +Redistribution and use in source and binary forms, with or without
607 +modification, are permitted provided that the following conditions are met:
608 + * Redistributions of source code must retain the above copyright
609 + notice, this list of conditions and the following disclaimer.
610 + * Redistributions in binary form must reproduce the above copyright
611 + notice, this list of conditions and the following disclaimer in the
612 + documentation and/or other materials provided with the distribution.
613 + * Neither the name of the copyright holder nor the
614 + names of its contributors may be used to endorse or promote products
615 + derived from this software without specific prior written permission.
617 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
618 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
619 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
620 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
621 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
622 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
623 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
624 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
625 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
626 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
629 +#include <linux/linkage.h>
630 +#include "arm-mem.h"
631 +#include "memcpymove.h"
633 +/* Prevent the stack from becoming executable */
634 +#if defined(__linux__) && defined(__ELF__)
635 +.section .note.GNU-stack,"",%progbits
646 + * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
648 + * a1 = pointer to destination
649 + * a2 = pointer to source
650 + * a3 = number of bytes to copy
655 +.set prefetch_distance, 3
663 +++ b/arch/arm/lib/memcpymove.h
666 +Copyright (c) 2013, Raspberry Pi Foundation
667 +Copyright (c) 2013, RISC OS Open Ltd
668 +All rights reserved.
670 +Redistribution and use in source and binary forms, with or without
671 +modification, are permitted provided that the following conditions are met:
672 + * Redistributions of source code must retain the above copyright
673 + notice, this list of conditions and the following disclaimer.
674 + * Redistributions in binary form must reproduce the above copyright
675 + notice, this list of conditions and the following disclaimer in the
676 + documentation and/or other materials provided with the distribution.
677 + * Neither the name of the copyright holder nor the
678 + names of its contributors may be used to endorse or promote products
679 + derived from this software without specific prior written permission.
681 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
682 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
683 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
684 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
685 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
686 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
687 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
688 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
689 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
690 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
693 +.macro unaligned_words backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8
696 + mov r1, r0, lsl #32-align*8
698 + orr r1, r1, r0, lsr #align*8
701 + mov r0, r1, lsr #align*8
703 + orr r0, r0, r1, lsl #32-align*8
709 + mov r2, r0, lsl #32-align*8
711 + orr r2, r2, r1, lsr #align*8
712 + mov r1, r1, lsl #32-align*8
713 + orr r1, r1, r0, lsr #align*8
717 + mov r0, r2, lsr #align*8
719 + orr r0, r0, r1, lsl #32-align*8
720 + mov r1, r1, lsr #align*8
721 + orr r1, r1, r2, lsl #32-align*8
727 + mov r4, r0, lsl #32-align*8
729 + orr r4, r4, r3, lsr #align*8
730 + mov r3, r3, lsl #32-align*8
731 + orr r3, r3, r2, lsr #align*8
732 + mov r2, r2, lsl #32-align*8
733 + orr r2, r2, r1, lsr #align*8
734 + mov r1, r1, lsl #32-align*8
735 + orr r1, r1, r0, lsr #align*8
736 + stmdb D!, {r1, r2, r3, r4}
739 + mov r0, r4, lsr #align*8
741 + orr r0, r0, r1, lsl #32-align*8
742 + mov r1, r1, lsr #align*8
743 + orr r1, r1, r2, lsl #32-align*8
744 + mov r2, r2, lsr #align*8
745 + orr r2, r2, r3, lsl #32-align*8
746 + mov r3, r3, lsr #align*8
747 + orr r3, r3, r4, lsl #32-align*8
748 + stmia D!, {r0, r1, r2, r3}
752 + ldmdb S!, {r4, r5, r6, r7}
753 + mov r8, r0, lsl #32-align*8
754 + ldmdb S!, {r0, r1, r2, r3}
758 + orr r8, r8, r7, lsr #align*8
759 + mov r7, r7, lsl #32-align*8
760 + orr r7, r7, r6, lsr #align*8
761 + mov r6, r6, lsl #32-align*8
762 + orr r6, r6, r5, lsr #align*8
763 + mov r5, r5, lsl #32-align*8
764 + orr r5, r5, r4, lsr #align*8
765 + mov r4, r4, lsl #32-align*8
766 + orr r4, r4, r3, lsr #align*8
767 + mov r3, r3, lsl #32-align*8
768 + orr r3, r3, r2, lsr #align*8
769 + mov r2, r2, lsl #32-align*8
770 + orr r2, r2, r1, lsr #align*8
771 + mov r1, r1, lsl #32-align*8
772 + orr r1, r1, r0, lsr #align*8
773 + stmdb D!, {r5, r6, r7, r8}
774 + stmdb D!, {r1, r2, r3, r4}
776 + ldmib S!, {r1, r2, r3, r4}
777 + mov r0, r8, lsr #align*8
778 + ldmib S!, {r5, r6, r7, r8}
782 + orr r0, r0, r1, lsl #32-align*8
783 + mov r1, r1, lsr #align*8
784 + orr r1, r1, r2, lsl #32-align*8
785 + mov r2, r2, lsr #align*8
786 + orr r2, r2, r3, lsl #32-align*8
787 + mov r3, r3, lsr #align*8
788 + orr r3, r3, r4, lsl #32-align*8
789 + mov r4, r4, lsr #align*8
790 + orr r4, r4, r5, lsl #32-align*8
791 + mov r5, r5, lsr #align*8
792 + orr r5, r5, r6, lsl #32-align*8
793 + mov r6, r6, lsr #align*8
794 + orr r6, r6, r7, lsl #32-align*8
795 + mov r7, r7, lsr #align*8
796 + orr r7, r7, r8, lsl #32-align*8
797 + stmia D!, {r0, r1, r2, r3}
798 + stmia D!, {r4, r5, r6, r7}
803 +.macro memcpy_leading_15bytes backwards, align
804 + movs DAT1, DAT2, lsl #31
807 + ldrmib DAT0, [S, #-1]!
808 + ldrcsh DAT1, [S, #-2]!
809 + strmib DAT0, [D, #-1]!
810 + strcsh DAT1, [D, #-2]!
812 + ldrmib DAT0, [S], #1
813 + ldrcsh DAT1, [S], #2
814 + strmib DAT0, [D], #1
815 + strcsh DAT1, [D], #2
817 + movs DAT1, DAT2, lsl #29
819 + ldrmi DAT0, [S, #-4]!
821 + ldmcsdb S!, {DAT1, DAT2}
823 + ldrcs DAT2, [S, #-4]!
824 + ldrcs DAT1, [S, #-4]!
826 + strmi DAT0, [D, #-4]!
827 + stmcsdb D!, {DAT1, DAT2}
829 + ldrmi DAT0, [S], #4
831 + ldmcsia S!, {DAT1, DAT2}
833 + ldrcs DAT1, [S], #4
834 + ldrcs DAT2, [S], #4
836 + strmi DAT0, [D], #4
837 + stmcsia D!, {DAT1, DAT2}
841 +.macro memcpy_trailing_15bytes backwards, align
845 + ldmcsdb S!, {DAT0, DAT1}
847 + ldrcs DAT1, [S, #-4]!
848 + ldrcs DAT0, [S, #-4]!
850 + ldrmi DAT2, [S, #-4]!
851 + stmcsdb D!, {DAT0, DAT1}
852 + strmi DAT2, [D, #-4]!
855 + ldmcsia S!, {DAT0, DAT1}
857 + ldrcs DAT0, [S], #4
858 + ldrcs DAT1, [S], #4
860 + ldrmi DAT2, [S], #4
861 + stmcsia D!, {DAT0, DAT1}
862 + strmi DAT2, [D], #4
866 + ldrcsh DAT0, [S, #-2]!
867 + ldrmib DAT1, [S, #-1]
868 + strcsh DAT0, [D, #-2]!
869 + strmib DAT1, [D, #-1]
871 + ldrcsh DAT0, [S], #2
873 + strcsh DAT0, [D], #2
878 +.macro memcpy_long_inner_loop backwards, align
881 + ldr DAT0, [S, #-align]!
883 + ldr LAST, [S, #-align]!
889 + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
891 + stmdb D!, {DAT4, DAT5, DAT6, LAST}
892 + stmdb D!, {DAT0, DAT1, DAT2, DAT3}
894 + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
896 + stmia D!, {DAT0, DAT1, DAT2, DAT3}
897 + stmia D!, {DAT4, DAT5, DAT6, LAST}
900 + unaligned_words backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
904 + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
905 + preload_trailing backwards, S, N, OFF
906 + add N, N, #(prefetch_distance+2)*32 - 32
910 + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
911 + stmdb D!, {DAT4, DAT5, DAT6, LAST}
912 + stmdb D!, {DAT0, DAT1, DAT2, DAT3}
914 + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
915 + stmia D!, {DAT0, DAT1, DAT2, DAT3}
916 + stmia D!, {DAT4, DAT5, DAT6, LAST}
919 + unaligned_words backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
926 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
927 + stmnedb D!, {DAT0, DAT1, DAT2, LAST}
929 + ldmneia S!, {DAT0, DAT1, DAT2, LAST}
930 + stmneia D!, {DAT0, DAT1, DAT2, LAST}
934 + unaligned_words backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST
937 + /* Trailing words and bytes */
943 + memcpy_trailing_15bytes backwards, align
945 + pop {DAT3, DAT4, DAT5, DAT6, DAT7}
946 + pop {D, DAT1, DAT2, pc}
949 +.macro memcpy_medium_inner_loop backwards, align
953 + ldmdb S!, {DAT0, DAT1, DAT2, LAST}
955 + ldr LAST, [S, #-4]!
956 + ldr DAT2, [S, #-4]!
957 + ldr DAT1, [S, #-4]!
958 + ldr DAT0, [S, #-4]!
960 + stmdb D!, {DAT0, DAT1, DAT2, LAST}
963 + ldmia S!, {DAT0, DAT1, DAT2, LAST}
970 + stmia D!, {DAT0, DAT1, DAT2, LAST}
974 + /* Trailing words and bytes */
977 + memcpy_trailing_15bytes backwards, align
979 + pop {D, DAT1, DAT2, pc}
982 +.macro memcpy_short_inner_loop backwards, align
986 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
988 + ldrne LAST, [S, #-4]!
989 + ldrne DAT2, [S, #-4]!
990 + ldrne DAT1, [S, #-4]!
991 + ldrne DAT0, [S, #-4]!
993 + stmnedb D!, {DAT0, DAT1, DAT2, LAST}
996 + ldmneia S!, {DAT0, DAT1, DAT2, LAST}
998 + ldrne DAT0, [S], #4
999 + ldrne DAT1, [S], #4
1000 + ldrne DAT2, [S], #4
1001 + ldrne LAST, [S], #4
1003 + stmneia D!, {DAT0, DAT1, DAT2, LAST}
1005 + memcpy_trailing_15bytes backwards, align
1007 + pop {D, DAT1, DAT2, pc}
1010 +.macro memcpy backwards
1027 + push {D, DAT1, DAT2, lr}
1029 + .cfi_def_cfa_offset 16
1030 + .cfi_rel_offset D, 0
1033 + .cfi_undefined DAT0
1034 + .cfi_rel_offset DAT1, 4
1035 + .cfi_rel_offset DAT2, 8
1036 + .cfi_undefined LAST
1037 + .cfi_rel_offset lr, 12
1044 + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
1047 + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
1048 + cmp N, #(prefetch_distance+3)*32 - 1
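For orientation, this entry sequence sorts the copy into one of three paths.
A sketch of the thresholds (the short-case limit is not visible in this hunk
and 31 bytes is assumed here, being the smallest length that always contains
one aligned 16-byte store; prefetch_distance is 3 for memcpy/memmove):

    enum copy_path { COPY_SHORT, COPY_MEDIUM, COPY_LONG };

    static enum copy_path classify(unsigned long n, unsigned long prefetch_distance)
    {
            if (n < 31)                                     /* assumed short-case limit */
                    return COPY_SHORT;                      /* no aligned 16-byte block guaranteed */
            if (n < (prefetch_distance + 3) * 32 - 1)
                    return COPY_MEDIUM;                     /* preload everything, then 16-byte blocks */
            return COPY_LONG;                               /* leading/trailing fixups + streaming loop */
    }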
1052 + push {DAT3, DAT4, DAT5, DAT6, DAT7}
1054 + .cfi_def_cfa_offset 36
1055 + .cfi_rel_offset D, 20
1056 + .cfi_rel_offset DAT1, 24
1057 + .cfi_rel_offset DAT2, 28
1058 + .cfi_rel_offset DAT3, 0
1059 + .cfi_rel_offset DAT4, 4
1060 + .cfi_rel_offset DAT5, 8
1061 + .cfi_rel_offset DAT6, 12
1062 + .cfi_rel_offset DAT7, 16
1063 + .cfi_rel_offset lr, 32
1065 + /* Adjust N so that the decrement instruction can also test for
1066 + * inner loop termination. We want it to stop when there are
1067 + * (prefetch_distance+1) complete blocks to go. */
1068 + sub N, N, #(prefetch_distance+2)*32
1069 + preload_leading_step1 backwards, DAT0, S
1071 + /* Bug in GAS: it accepts, but mis-assembles the instruction
1072 + * ands DAT2, D, #60, 2
1073 + * which sets DAT2 to the number of leading bytes until the destination is aligned, and also clears C (sets borrow)
1080 + rsb DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
1082 + preload_leading_step2 backwards, DAT0, S, DAT2, OFF
1083 + memcpy_leading_15bytes backwards, 1
1084 +154: /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
1085 + /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
1089 + sub OFF, OFF, #32*(prefetch_distance+1)
1092 + rsb OFF, OFF, #32*prefetch_distance
1094 + movs DAT0, S, lsl #31
1098 + memcpy_long_inner_loop backwards, 0
1099 +155: memcpy_long_inner_loop backwards, 1
1100 +156: memcpy_long_inner_loop backwards, 2
1101 +157: memcpy_long_inner_loop backwards, 3
1103 + .cfi_def_cfa_offset 16
1104 + .cfi_rel_offset D, 0
1105 + .cfi_rel_offset DAT1, 4
1106 + .cfi_rel_offset DAT2, 8
1107 + .cfi_same_value DAT3
1108 + .cfi_same_value DAT4
1109 + .cfi_same_value DAT5
1110 + .cfi_same_value DAT6
1111 + .cfi_same_value DAT7
1112 + .cfi_rel_offset lr, 12
1114 +160: /* Medium case */
1115 + preload_all backwards, 0, 0, S, N, DAT2, OFF
1116 + sub N, N, #16 /* simplifies inner loop termination */
1123 + rsb DAT2, DAT2, #16
1125 + memcpy_leading_15bytes backwards, align
1126 +164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
1129 + memcpy_medium_inner_loop backwards, 0
1130 +140: memcpy_medium_inner_loop backwards, 1
1132 +170: /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
1135 + preload_all backwards, 1, 0, S, N, DAT2, LAST
1141 + ldrb DAT0, [S, #-1]!
1142 + strb DAT0, [D, #-1]!
1144 + ldrb DAT0, [S], #1
1145 + strb DAT0, [D], #1
1149 +174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
1152 + memcpy_short_inner_loop backwards, 0
1153 +140: memcpy_short_inner_loop backwards, 1
1172 +++ b/arch/arm/lib/memmove_rpi.S
1175 +Copyright (c) 2013, Raspberry Pi Foundation
1176 +Copyright (c) 2013, RISC OS Open Ltd
1177 +All rights reserved.
1179 +Redistribution and use in source and binary forms, with or without
1180 +modification, are permitted provided that the following conditions are met:
1181 + * Redistributions of source code must retain the above copyright
1182 + notice, this list of conditions and the following disclaimer.
1183 + * Redistributions in binary form must reproduce the above copyright
1184 + notice, this list of conditions and the following disclaimer in the
1185 + documentation and/or other materials provided with the distribution.
1186 + * Neither the name of the copyright holder nor the
1187 + names of its contributors may be used to endorse or promote products
1188 + derived from this software without specific prior written permission.
1190 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
1191 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1192 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1193 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
1194 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1195 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
1196 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
1197 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1198 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1199 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1202 +#include <linux/linkage.h>
1203 +#include "arm-mem.h"
1204 +#include "memcpymove.h"
1206 +/* Prevent the stack from becoming executable */
1207 +#if defined(__linux__) && defined(__ELF__)
1208 +.section .note.GNU-stack,"",%progbits
1213 + .object_arch armv4
1219 + * void *memmove(void *s1, const void *s2, size_t n);
1221 + * a1 = pointer to destination
1222 + * a2 = pointer to source
1223 + * a3 = number of bytes to copy
1228 +.set prefetch_distance, 3
1232 + bpl memcpy /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */
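In C terms the dispatch amounts to the following (illustrative only; the
assembler compares the pointers and relies on the 'pl' condition, which is why
the comment notes it still works across the signed wrap-around boundaries):

    void *memmove_sketch(void *dst, const void *src, unsigned long n)
    {
            unsigned char *d = dst;
            const unsigned char *s = src;
            unsigned long i;

            if (d <= s) {           /* forward copy cannot overwrite unread source bytes */
                    for (i = 0; i < n; i++)
                            d[i] = s[i];
            } else {                /* regions may overlap: copy from the end backwards */
                    for (i = n; i > 0; i--)
                            d[i - 1] = s[i - 1];
            }
            return dst;
    }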
1236 +++ b/arch/arm/lib/memset_rpi.S
1239 +Copyright (c) 2013, Raspberry Pi Foundation
1240 +Copyright (c) 2013, RISC OS Open Ltd
1241 +All rights reserved.
1243 +Redistribution and use in source and binary forms, with or without
1244 +modification, are permitted provided that the following conditions are met:
1245 + * Redistributions of source code must retain the above copyright
1246 + notice, this list of conditions and the following disclaimer.
1247 + * Redistributions in binary form must reproduce the above copyright
1248 + notice, this list of conditions and the following disclaimer in the
1249 + documentation and/or other materials provided with the distribution.
1250 + * Neither the name of the copyright holder nor the
1251 + names of its contributors may be used to endorse or promote products
1252 + derived from this software without specific prior written permission.
1254 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
1255 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1256 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1257 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
1258 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1259 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
1260 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
1261 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1262 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1263 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1266 +#include <linux/linkage.h>
1267 +#include "arm-mem.h"
1269 +/* Prevent the stack from becoming executable */
1270 +#if defined(__linux__) && defined(__ELF__)
1271 +.section .note.GNU-stack,"",%progbits
1276 + .object_arch armv4
1282 + * void *memset(void *s, int c, size_t n);
1284 + * a1 = pointer to buffer to fill
1285 + * a2 = byte pattern to fill with (caller-narrowed)
1286 + * a3 = number of bytes to fill
1299 + orr DAT0, DAT0, lsl #8
1301 + orr DAT0, DAT0, lsl #16
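The two orr instructions replicate the fill byte into every byte lane of a
32-bit word, so the loops below can store whole words (and 16 bytes per stmia).
In C:

    static unsigned int replicate_byte(unsigned int c)
    {
            c &= 0xff;              /* the assembler relies on the caller narrowing */
            c |= c << 8;            /* 0x000000AB -> 0x0000ABAB */
            c |= c << 16;           /* 0x0000ABAB -> 0xABABABAB */
            return c;
    }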
1304 + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
1308 +161: sub N, N, #16 /* simplifies inner loop termination */
1309 + /* Leading words and bytes */
1312 + rsb DAT3, S, #0 /* bits 0-3 = number of leading bytes until aligned */
1313 + movs DAT2, DAT3, lsl #31
1315 + strmib DAT0, [S], #1
1317 + strcsh DAT0, [S], #2
1318 + movs DAT2, DAT3, lsl #29
1320 + strmi DAT0, [S], #4
1322 + stmcsia S!, {DAT0, DAT1}
1323 +164: /* Delayed setup of DAT2 and DAT3 so we could use them as scratch registers above */
1326 + /* Now the inner loop of 16-byte stores */
1327 +165: stmia S!, {DAT0, DAT1, DAT2, DAT3}
1330 +166: /* Trailing words and bytes */
1331 + movs N, N, lsl #29
1332 + stmcsia S!, {DAT0, DAT1}
1333 + strmi DAT0, [S], #4
1335 + strcsh DAT0, [S], #2
1339 +170: /* Short case */
1346 + strb DAT0, [S], #1
1350 + stmneia S!, {DAT0, DAT1, DAT2, DAT3}
1361 --- a/arch/arm/lib/uaccess_with_memcpy.c
1362 +++ b/arch/arm/lib/uaccess_with_memcpy.c
1364 #include <asm/current.h>
1365 #include <asm/page.h>
1367 +#ifndef COPY_FROM_USER_THRESHOLD
1368 +#define COPY_FROM_USER_THRESHOLD 64
1371 +#ifndef COPY_TO_USER_THRESHOLD
1372 +#define COPY_TO_USER_THRESHOLD 64
1376 pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
1378 @@ -85,7 +93,44 @@ pin_page_for_write(const void __user *_a
1382 -static unsigned long noinline
1384 +pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
1386 + unsigned long addr = (unsigned long)_addr;
1393 + pgd = pgd_offset(current->mm, addr);
1394 + if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
1398 + pud = pud_offset(pgd, addr);
1399 + if (unlikely(pud_none(*pud) || pud_bad(*pud)))
1404 + pmd = pmd_offset(pud, addr);
1405 + if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
1408 + pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
1409 + if (unlikely(!pte_present(*pte) || !pte_young(*pte))) {
1410 + pte_unmap_unlock(pte, ptl);
1420 +unsigned long noinline
1421 __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
1423 unsigned long ua_flags;
1424 @@ -138,6 +183,54 @@ out:
1428 +unsigned long noinline
1429 +__copy_from_user_memcpy(void *to, const void __user *from, unsigned long n)
1433 + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
1434 + memcpy(to, (const void *)from, n);
1438 + /* the mmap semaphore is taken only if not in an atomic context */
1439 + atomic = in_atomic();
1442 + down_read(&current->mm->mmap_sem);
1448 + while (!pin_page_for_read(from, &pte, &ptl)) {
1451 + up_read(&current->mm->mmap_sem);
1452 + if (__get_user(temp, (char __user *)from))
1455 + down_read(&current->mm->mmap_sem);
1458 + tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1;
1462 + memcpy(to, (const void *)from, tocopy);
1467 + pte_unmap_unlock(pte, ptl);
1470 + up_read(&current->mm->mmap_sem);
1477 arm_copy_to_user(void __user *to, const void *from, unsigned long n)
1479 @@ -148,7 +241,7 @@ arm_copy_to_user(void __user *to, const
1480 * With frame pointer disabled, tail call optimization kicks in
1481 * as well making this test almost invisible.
1484 + if (n < COPY_TO_USER_THRESHOLD) {
1485 unsigned long ua_flags = uaccess_save_and_enable();
1486 n = __copy_to_user_std(to, from, n);
1487 uaccess_restore(ua_flags);
1488 @@ -157,6 +250,21 @@ arm_copy_to_user(void __user *to, const
1493 +unsigned long __must_check
1494 +arm_copy_from_user(void *to, const void __user *from, unsigned long n)
1497 + * This test is stubbed out of the main function above to keep
1498 + * the overhead for small copies low by avoiding a large
1499 + * register dump on the stack just to reload them right away.
1500 + * With frame pointer disabled, tail call optimization kicks in
1501 + * as well making this test almost invisible.
1503 + if (n < COPY_FROM_USER_THRESHOLD)
1504 + return __copy_from_user_std(to, from, n);
1505 + return __copy_from_user_memcpy(to, from, n);
1508 static unsigned long noinline
1509 __clear_user_memset(void __user *addr, unsigned long n)