brcm2708: update 4.1 patches
[openwrt/openwrt.git] / target / linux / brcm2708 / patches-4.1 / 0070-Improve-__copy_to_user-and-__copy_from_user-performa.patch
1 From 275c3679e7e3be706d11e5b60a1c75249b108343 Mon Sep 17 00:00:00 2001
2 From: Phil Elwell <phil@raspberrypi.org>
3 Date: Mon, 13 Oct 2014 11:47:53 +0100
4 Subject: [PATCH 070/171] Improve __copy_to_user and __copy_from_user
5 performance
6
7 Provide a __copy_from_user that uses memcpy. On BCM2708, use
8 optimised memcpy/memmove/memcmp/memset implementations.
9 ---
10 arch/arm/include/asm/string.h | 5 +
11 arch/arm/include/asm/uaccess.h | 1 +
12 arch/arm/lib/Makefile | 15 +-
13 arch/arm/lib/arm-mem.h | 159 ++++++++++++
14 arch/arm/lib/copy_from_user.S | 4 +-
15 arch/arm/lib/exports_rpi.c | 37 +++
16 arch/arm/lib/memcmp_rpi.S | 285 +++++++++++++++++++++
17 arch/arm/lib/memcpy_rpi.S | 59 +++++
18 arch/arm/lib/memcpymove.h | 506 +++++++++++++++++++++++++++++++++++++
19 arch/arm/lib/memmove_rpi.S | 61 +++++
20 arch/arm/lib/memset_rpi.S | 121 +++++++++
21 arch/arm/lib/uaccess_with_memcpy.c | 112 +++++++-
22 12 files changed, 1359 insertions(+), 6 deletions(-)
23 create mode 100644 arch/arm/lib/arm-mem.h
24 create mode 100644 arch/arm/lib/exports_rpi.c
25 create mode 100644 arch/arm/lib/memcmp_rpi.S
26 create mode 100644 arch/arm/lib/memcpy_rpi.S
27 create mode 100644 arch/arm/lib/memcpymove.h
28 create mode 100644 arch/arm/lib/memmove_rpi.S
29 create mode 100644 arch/arm/lib/memset_rpi.S
30
31 --- a/arch/arm/include/asm/string.h
32 +++ b/arch/arm/include/asm/string.h
33 @@ -24,6 +24,11 @@ extern void * memchr(const void *, int,
34 #define __HAVE_ARCH_MEMSET
35 extern void * memset(void *, int, __kernel_size_t);
36
37 +#ifdef CONFIG_MACH_BCM2708
38 +#define __HAVE_ARCH_MEMCMP
39 +extern int memcmp(const void *, const void *, size_t);
40 +#endif
41 +
42 extern void __memzero(void *ptr, __kernel_size_t n);
43
44 #define memset(p,v,n) \
45 --- a/arch/arm/include/asm/uaccess.h
46 +++ b/arch/arm/include/asm/uaccess.h
47 @@ -475,6 +475,7 @@ do { \
48
49 #ifdef CONFIG_MMU
50 extern unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n);
51 +extern unsigned long __must_check __copy_from_user_std(void *to, const void __user *from, unsigned long n);
52 extern unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n);
53 extern unsigned long __must_check __copy_to_user_std(void __user *to, const void *from, unsigned long n);
54 extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n);
55 --- a/arch/arm/lib/Makefile
56 +++ b/arch/arm/lib/Makefile
57 @@ -6,9 +6,8 @@
58
59 lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
60 csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
61 - delay.o delay-loop.o findbit.o memchr.o memcpy.o \
62 - memmove.o memset.o memzero.o setbit.o \
63 - strchr.o strrchr.o \
64 + delay.o delay-loop.o findbit.o memchr.o memzero.o \
65 + setbit.o strchr.o strrchr.o \
66 testchangebit.o testclearbit.o testsetbit.o \
67 ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \
68 ucmpdi2.o lib1funcs.o div64.o \
69 @@ -18,6 +17,16 @@ lib-y := backtrace.o changebit.o csumip
70 mmu-y := clear_user.o copy_page.o getuser.o putuser.o \
71 copy_from_user.o copy_to_user.o
72
73 +# Choose optimised implementations for Raspberry Pi
74 +ifeq ($(CONFIG_MACH_BCM2708),y)
75 + CFLAGS_uaccess_with_memcpy.o += -DCOPY_FROM_USER_THRESHOLD=1600
76 + CFLAGS_uaccess_with_memcpy.o += -DCOPY_TO_USER_THRESHOLD=672
77 + obj-$(CONFIG_MODULES) += exports_rpi.o
78 + lib-y += memcpy_rpi.o memmove_rpi.o memset_rpi.o memcmp_rpi.o
79 +else
80 + lib-y += memcpy.o memmove.o memset.o
81 +endif
82 +
83 # using lib_ here won't override already available weak symbols
84 obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o
85
86 --- /dev/null
87 +++ b/arch/arm/lib/arm-mem.h
88 @@ -0,0 +1,159 @@
89 +/*
90 +Copyright (c) 2013, Raspberry Pi Foundation
91 +Copyright (c) 2013, RISC OS Open Ltd
92 +All rights reserved.
93 +
94 +Redistribution and use in source and binary forms, with or without
95 +modification, are permitted provided that the following conditions are met:
96 + * Redistributions of source code must retain the above copyright
97 + notice, this list of conditions and the following disclaimer.
98 + * Redistributions in binary form must reproduce the above copyright
99 + notice, this list of conditions and the following disclaimer in the
100 + documentation and/or other materials provided with the distribution.
101 + * Neither the name of the copyright holder nor the
102 + names of its contributors may be used to endorse or promote products
103 + derived from this software without specific prior written permission.
104 +
105 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
106 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
107 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
108 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
109 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
110 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
111 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
112 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
113 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
114 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
115 +*/
116 +
117 +.macro myfunc fname /* Start a function: mark it with .func, export the symbol, and place its label. */
118 + .func fname
119 + .global fname
120 +fname:
121 +.endm
122 +
123 +.macro preload_leading_step1 backwards, ptr, base /* Set ptr to the 32-byte-aligned address derived from base, then PLD prefetch_distance+1 consecutive 32-byte lines from it. */
124 +/* If the destination is already 16-byte aligned, then we need to preload
125 + * between 0 and prefetch_distance (inclusive) cache lines ahead so there
126 + * are no gaps when the inner loop starts.
127 + */
128 + .if backwards
129 + sub ptr, base, #1 /* descending: start from the byte just below base */
130 + bic ptr, ptr, #31 /* round down to a 32-byte boundary */
131 + .else
132 + bic ptr, base, #31 /* round down to a 32-byte boundary */
133 + .endif
134 + .set OFFSET, 0 /* assembly-time displacement, stepped by 32 for each PLD */
135 + .rept prefetch_distance+1
136 + pld [ptr, #OFFSET]
137 + .if backwards
138 + .set OFFSET, OFFSET-32
139 + .else
140 + .set OFFSET, OFFSET+32
141 + .endif
142 + .endr
143 +.endm
144 +
145 +.macro preload_leading_step2 backwards, ptr, base, leading_bytes, tmp /* Issue at most one extra PLD, 32*(prefetch_distance+1) bytes beyond ptr in the copy direction; tmp is scratch. */
146 +/* However, if the destination is not 16-byte aligned, we may need to
147 + * preload one more cache line than that. The question we need to ask is:
148 + * are the leading bytes more than the amount by which the source
149 + * pointer will be rounded down for preloading, and if so, by how many
150 + * cache lines?
151 + */
152 + .if backwards
153 +/* Here we compare against how many bytes we are into the
154 + * cache line, counting down from the highest such address.
155 + * Effectively, we want to calculate
156 + * leading_bytes = dst&15
157 + * cacheline_offset = 31-((src-leading_bytes-1)&31)
158 + * extra_needed = leading_bytes - cacheline_offset
159 + * and test if extra_needed is <= 0, or rearranging:
160 + * leading_bytes + (src-leading_bytes-1)&31 <= 31
161 + */
162 + mov tmp, base, lsl #32-5 /* keep only the low 5 bits of base, moved to the top of tmp */
163 + sbc tmp, tmp, leading_bytes, lsl #32-5 /* NOTE(review): SBC also subtracts the inverted carry; the -1 in the formula above presumably relies on C being clear on entry - confirm at the call sites */
164 + adds tmp, tmp, leading_bytes, lsl #32-5 /* carry out means the 5-bit sum exceeded 31 */
165 + bcc 61f /* no carry: the extra line is not needed */
166 + pld [ptr, #-32*(prefetch_distance+1)]
167 + .else
168 +/* Effectively, we want to calculate
169 + * leading_bytes = (-dst)&15
170 + * cacheline_offset = (src+leading_bytes)&31
171 + * extra_needed = leading_bytes - cacheline_offset
172 + * and test if extra_needed is <= 0.
173 + */
174 + mov tmp, base, lsl #32-5 /* keep only the low 5 bits of base, moved to the top of tmp */
175 + add tmp, tmp, leading_bytes, lsl #32-5 /* tmp = cacheline_offset in the top 5 bits */
176 + rsbs tmp, tmp, leading_bytes, lsl #32-5 /* tmp = leading_bytes - cacheline_offset, setting flags */
177 + bls 61f /* unsigned lower-or-same: the extra line is not needed */
178 + pld [ptr, #32*(prefetch_distance+1)]
179 + .endif
180 +61: /* join point for both directions */
181 +.endm
182 +
183 +.macro preload_trailing backwards, base, remain, tmp /* Issue the last 0, 1 or 2 PLDs, chosen from the low 5 bits of base and remain; tmp is scratch. */
184 + /* We need either 0, 1 or 2 extra preloads */
185 + .if backwards
186 + rsb tmp, base, #0 /* descending: work with the negated address */
187 + mov tmp, tmp, lsl #32-5 /* keep only its low 5 bits, moved to the top of tmp */
188 + .else
189 + mov tmp, base, lsl #32-5 /* keep only the low 5 bits of base, moved to the top of tmp */
190 + .endif
191 + adds tmp, tmp, remain, lsl #32-5 /* add the low 5 bits of remain; sets C and Z */
192 + adceqs tmp, tmp, #0
193 + /* The instruction above has two effects: ensures Z is only
194 + * set if C was clear (so Z indicates that both shifted quantities
195 + * were 0), and clears C if Z was set (so C indicates that the sum
196 + * of the shifted quantities was greater and not equal to 32) */
197 + beq 82f /* both quantities were 0: nothing more to preload */
198 + .if backwards
199 + sub tmp, base, #1 /* SUB and BIC without the S suffix leave the flags untouched */
200 + bic tmp, tmp, #31
201 + .else
202 + bic tmp, base, #31 /* BIC without the S suffix leaves the flags untouched */
203 + .endif
204 + bcc 81f /* C clear: only one extra preload, skip the farther one */
205 + .if backwards
206 + pld [tmp, #-32*(prefetch_distance+1)]
207 +81:
208 + pld [tmp, #-32*prefetch_distance]
209 + .else
210 + pld [tmp, #32*(prefetch_distance+2)]
211 +81:
212 + pld [tmp, #32*(prefetch_distance+1)]
213 + .endif
214 +82: /* exit when no preload was needed */
215 +.endm
216 +
217 +.macro preload_all backwards, narrow_case, shift, base, remain, tmp0, tmp1 /* PLD every 32-byte line touched by the (remain << shift)-byte region at base; tmp0 and tmp1 are scratch. */
218 + .if backwards
219 + sub tmp0, base, #1
220 + bic tmp0, tmp0, #31 /* tmp0 = line holding the byte just below base */
221 + pld [tmp0]
222 + sub tmp1, base, remain, lsl #shift /* tmp1 = lowest address of the region */
223 + .else
224 + bic tmp0, base, #31 /* tmp0 = line holding the first byte */
225 + pld [tmp0]
226 + add tmp1, base, remain, lsl #shift
227 + sub tmp1, tmp1, #1 /* tmp1 = address of the last byte of the region */
228 + .endif
229 + bic tmp1, tmp1, #31 /* tmp1 = line holding the far end of the region */
230 + cmp tmp1, tmp0
231 + beq 92f /* both ends share one line, which is already preloaded */
232 + .if narrow_case
233 + /* In this case, all the data fits in either 1 or 2 cache lines */
234 + pld [tmp1]
235 + .else
236 +91: /* step tmp0 one line at a time until it reaches tmp1 */
237 + .if backwards
238 + sub tmp0, tmp0, #32
239 + .else
240 + add tmp0, tmp0, #32
241 + .endif
242 + cmp tmp0, tmp1
243 + pld [tmp0] /* PLD leaves the flags alone, so BNE still tests the CMP */
244 + bne 91b
245 + .endif
246 +92:
247 +.endm
248 --- a/arch/arm/lib/copy_from_user.S
249 +++ b/arch/arm/lib/copy_from_user.S
250 @@ -89,11 +89,13 @@
251
252 .text
253
254 -ENTRY(__copy_from_user)
255 +ENTRY(__copy_from_user_std)
256 +WEAK(__copy_from_user)
257
258 #include "copy_template.S"
259
260 ENDPROC(__copy_from_user)
261 +ENDPROC(__copy_from_user_std)
262
263 .pushsection .fixup,"ax"
264 .align 0
265 --- /dev/null
266 +++ b/arch/arm/lib/exports_rpi.c
267 @@ -0,0 +1,37 @@
268 +/*
269 + * Copyright (c) 2014, Raspberry Pi (Trading) Ltd.
270 + *
271 + * Redistribution and use in source and binary forms, with or without
272 + * modification, are permitted provided that the following conditions
273 + * are met:
274 + * 1. Redistributions of source code must retain the above copyright
275 + * notice, this list of conditions, and the following disclaimer,
276 + * without modification.
277 + * 2. Redistributions in binary form must reproduce the above copyright
278 + * notice, this list of conditions and the following disclaimer in the
279 + * documentation and/or other materials provided with the distribution.
280 + * 3. The names of the above-listed copyright holders may not be used
281 + * to endorse or promote products derived from this software without
282 + * specific prior written permission.
283 + *
284 + * ALTERNATIVELY, this software may be distributed under the terms of the
285 + * GNU General Public License ("GPL") version 2, as published by the Free
286 + * Software Foundation.
287 + *
288 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
289 + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
290 + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
291 + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
292 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
293 + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
294 + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
295 + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
296 + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
297 + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
298 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
299 + */
300 +
301 +#include <linux/kernel.h>
302 +#include <linux/module.h>
303 +
304 +EXPORT_SYMBOL(memcmp); /* export memcmp for modules; the Makefile builds this file only under CONFIG_MODULES */
305 --- /dev/null
306 +++ b/arch/arm/lib/memcmp_rpi.S
307 @@ -0,0 +1,285 @@
308 +/*
309 +Copyright (c) 2013, Raspberry Pi Foundation
310 +Copyright (c) 2013, RISC OS Open Ltd
311 +All rights reserved.
312 +
313 +Redistribution and use in source and binary forms, with or without
314 +modification, are permitted provided that the following conditions are met:
315 + * Redistributions of source code must retain the above copyright
316 + notice, this list of conditions and the following disclaimer.
317 + * Redistributions in binary form must reproduce the above copyright
318 + notice, this list of conditions and the following disclaimer in the
319 + documentation and/or other materials provided with the distribution.
320 + * Neither the name of the copyright holder nor the
321 + names of its contributors may be used to endorse or promote products
322 + derived from this software without specific prior written permission.
323 +
324 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
325 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
326 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
327 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
328 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
329 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
330 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
331 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
332 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
333 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
334 +*/
335 +
336 +#include <linux/linkage.h>
337 +#include "arm-mem.h"
338 +
339 +/* Prevent the stack from becoming executable */
340 +#if defined(__linux__) && defined(__ELF__)
341 +.section .note.GNU-stack,"",%progbits
342 +#endif
343 +
344 + .text
345 + .arch armv6 /* accept ARMv6 instructions such as SETEND, used below */
346 + .object_arch armv4 /* but record ARMv4 as the architecture in the object file */
347 + .arm /* ARM (not Thumb) encoding */
348 + .altmacro /* enable the alternate macro syntax */
349 + .p2align 2 /* align to a 4-byte boundary */
350 +
351 +.macro memcmp_process_head unaligned /* Fetch the next 16 bytes of each buffer: S_1 into DAT0-DAT3, S_2 into DAT4-DAT7. */
352 + .if unaligned
353 + ldr DAT0, [S_1], #4 /* word-at-a-time loads for an S_1 that is not 4-byte aligned */
354 + ldr DAT1, [S_1], #4
355 + ldr DAT2, [S_1], #4
356 + ldr DAT3, [S_1], #4
357 + .else
358 + ldmia S_1!, {DAT0, DAT1, DAT2, DAT3}
359 + .endif
360 + ldmia S_2!, {DAT4, DAT5, DAT6, DAT7} /* callers word-align S_2 before using this macro */
361 +.endm
362 +
363 +.macro memcmp_process_tail /* Compare the four word pairs loaded by memcmp_process_head, in address order. */
364 + cmp DAT0, DAT4
365 + cmpeq DAT1, DAT5 /* each CMPEQ runs only while all earlier pairs matched */
366 + cmpeq DAT2, DAT6
367 + cmpeq DAT3, DAT7
368 + bne 200f /* flags belong to the first differing pair; 200 turns them into the result */
369 +.endm
370 +
371 +.macro memcmp_leading_31bytes /* Compare the leading (OFF & 31) bytes in pieces of 1, 2, 4, 8 and 16 bytes selected by the bits of OFF, reducing N to match. */
372 + movs DAT0, OFF, lsl #31 /* N flag = OFF bit 0 (1 byte), C flag = OFF bit 1 (2 bytes) */
373 + ldrmib DAT0, [S_1], #1
374 + ldrcsh DAT1, [S_1], #2
375 + ldrmib DAT4, [S_2], #1
376 + ldrcsh DAT5, [S_2], #2
377 + movpl DAT0, #0 /* zero whatever was not loaded so the compares below see equal values */
378 + movcc DAT1, #0
379 + movpl DAT4, #0
380 + movcc DAT5, #0
381 + submi N, N, #1
382 + subcs N, N, #2
383 + cmp DAT0, DAT4
384 + cmpeq DAT1, DAT5
385 + bne 200f
386 + movs DAT0, OFF, lsl #29 /* N flag = OFF bit 2 (4 bytes), C flag = OFF bit 3 (8 bytes) */
387 + ldrmi DAT0, [S_1], #4
388 + ldrcs DAT1, [S_1], #4
389 + ldrcs DAT2, [S_1], #4
390 + ldrmi DAT4, [S_2], #4
391 + ldmcsia S_2!, {DAT5, DAT6}
392 + movpl DAT0, #0 /* zero whatever was not loaded, as above */
393 + movcc DAT1, #0
394 + movcc DAT2, #0
395 + movpl DAT4, #0
396 + movcc DAT5, #0
397 + movcc DAT6, #0
398 + submi N, N, #4
399 + subcs N, N, #8
400 + cmp DAT0, DAT4
401 + cmpeq DAT1, DAT5
402 + cmpeq DAT2, DAT6
403 + bne 200f
404 + tst OFF, #16 /* OFF bit 4 selects a final 16-byte piece */
405 + beq 105f
406 + memcmp_process_head 1 /* always uses the single-word load form for S_1 */
407 + sub N, N, #16
408 + memcmp_process_tail
409 +105:
410 +.endm
411 +
412 +.macro memcmp_trailing_15bytes unaligned /* Compare the final (N & 15) bytes in pieces of 8, 4, 2 and 1 bytes; N is destroyed. */
413 + movs N, N, lsl #29 /* C flag = N bit 3 (8 bytes), N flag = N bit 2 (4 bytes) */
414 + .if unaligned
415 + ldrcs DAT0, [S_1], #4
416 + ldrcs DAT1, [S_1], #4
417 + .else
418 + ldmcsia S_1!, {DAT0, DAT1}
419 + .endif
420 + ldrmi DAT2, [S_1], #4
421 + ldmcsia S_2!, {DAT4, DAT5}
422 + ldrmi DAT6, [S_2], #4
423 + movcc DAT0, #0 /* zero whatever was not loaded so the compares below see equal values */
424 + movcc DAT1, #0
425 + movpl DAT2, #0
426 + movcc DAT4, #0
427 + movcc DAT5, #0
428 + movpl DAT6, #0
429 + cmp DAT0, DAT4
430 + cmpeq DAT1, DAT5
431 + cmpeq DAT2, DAT6
432 + bne 200f
433 + movs N, N, lsl #2 /* C flag = original N bit 1 (2 bytes), N flag = original N bit 0 (1 byte) */
434 + ldrcsh DAT0, [S_1], #2
435 + ldrmib DAT1, [S_1] /* last access: no pointer write-back */
436 + ldrcsh DAT4, [S_2], #2
437 + ldrmib DAT5, [S_2] /* last access: no pointer write-back */
438 + movcc DAT0, #0
439 + movpl DAT1, #0
440 + movcc DAT4, #0
441 + movpl DAT5, #0
442 + cmp DAT0, DAT4
443 + cmpeq DAT1, DAT5
444 + bne 200f
445 +.endm
446 +
447 +.macro memcmp_long_inner_loop unaligned /* Long-buffer body: 32 bytes per pass with prefetch, then the remaining 16-byte blocks and the tail; returns 0 when everything matched. */
448 +110:
449 + memcmp_process_head unaligned
450 + pld [S_2, #prefetch_distance*32 + 16] /* S_2 is 16 bytes past a line start here, so this address is itself a line start */
451 + memcmp_process_tail
452 + memcmp_process_head unaligned
453 + pld [S_1, OFF] /* OFF is computed by memcmp from the low 5 bits of S_1 */
454 + memcmp_process_tail
455 + subs N, N, #32
456 + bhs 110b /* N was biased by memcmp so this loop stops before the final blocks */
457 + /* Just before the final (prefetch_distance+1) 32-byte blocks,
458 + * deal with final preloads */
459 + preload_trailing 0, S_1, N, DAT0
460 + preload_trailing 0, S_2, N, DAT0
461 + add N, N, #(prefetch_distance+2)*32 - 16 /* remove the entry bias, keeping N 16 low so SUBS/BHS below ends the loop */
462 +120:
463 + memcmp_process_head unaligned
464 + memcmp_process_tail
465 + subs N, N, #16
466 + bhs 120b
467 + /* Trailing words and bytes */
468 + tst N, #15 /* offsets that are multiples of 16 leave the low four bits equal to the leftover byte count */
469 + beq 199f
470 + memcmp_trailing_15bytes unaligned
471 +199: /* Reached end without detecting a difference */
472 + mov a1, #0
473 + setend le /* undo the SETEND BE done at memcmp entry */
474 + pop {DAT1-DAT6, pc} /* restore the registers pushed at entry and return */
475 +.endm
476 +
477 +.macro memcmp_short_inner_loop unaligned /* Short-buffer body: 16-byte blocks without prefetch, then the tail; returns 0 when everything matched. */
478 + subs N, N, #16 /* simplifies inner loop termination */
479 + blo 122f /* fewer than 16 bytes: go straight to the tail */
480 +120:
481 + memcmp_process_head unaligned
482 + memcmp_process_tail
483 + subs N, N, #16
484 + bhs 120b
485 +122: /* Trailing words and bytes */
486 + tst N, #15 /* subtracting 16 leaves the low four bits equal to the leftover byte count */
487 + beq 199f
488 + memcmp_trailing_15bytes unaligned
489 +199: /* Reached end without detecting a difference */
490 + mov a1, #0
491 + setend le /* undo the SETEND BE done at memcmp entry */
492 + pop {DAT1-DAT6, pc} /* restore the registers pushed at entry and return */
493 +.endm
494 +
495 +/*
496 + * int memcmp(const void *s1, const void *s2, size_t n);
497 + * On entry:
498 + * a1 = pointer to buffer 1
499 + * a2 = pointer to buffer 2
500 + * a3 = number of bytes to compare (as unsigned chars)
501 + * On exit:
502 + * a1 = >0/=0/<0 if s1 >/=/< s2
503 + */
504 +
505 +.set prefetch_distance, 2 /* read by the preload_* macros and the long-loop PLD offsets */
506 +
507 +ENTRY(memcmp)
508 + S_1 .req a1
509 + S_2 .req a2
510 + N .req a3
511 + DAT0 .req a4
512 + DAT1 .req v1
513 + DAT2 .req v2
514 + DAT3 .req v3
515 + DAT4 .req v4
516 + DAT5 .req v5
517 + DAT6 .req v6
518 + DAT7 .req ip
519 + OFF .req lr
520 +
521 + push {DAT1-DAT6, lr} /* save v1-v6 and lr; lr is reused as OFF */
522 + setend be /* lowest-addressed bytes are most significant */
523 +
524 + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
525 + cmp N, #(prefetch_distance+3)*32 - 1
526 + blo 170f /* too short for the prefetching loop */
527 +
528 + /* Long case */
529 + /* Adjust N so that the decrement instruction can also test for
530 + * inner loop termination. We want it to stop when there are
531 + * (prefetch_distance+1) complete blocks to go. */
532 + sub N, N, #(prefetch_distance+2)*32
533 + preload_leading_step1 0, DAT0, S_1
534 + preload_leading_step1 0, DAT1, S_2
535 + tst S_2, #31
536 + beq 154f /* S_2 already 32-byte aligned: no leading bytes to compare */
537 + rsb OFF, S_2, #0 /* no need to AND with 15 here */
538 + preload_leading_step2 0, DAT0, S_1, OFF, DAT2
539 + preload_leading_step2 0, DAT1, S_2, OFF, DAT2
540 + memcmp_leading_31bytes
541 +154: /* Second source now cacheline (32-byte) aligned; we have at
542 + * least one prefetch to go. */
543 + /* Prefetch offset is best selected such that it lies in the
544 + * first 8 of each 32 bytes - but it's just as easy to aim for
545 + * the first one */
546 + and OFF, S_1, #31
547 + rsb OFF, OFF, #32*prefetch_distance /* OFF = 32*prefetch_distance - (S_1 & 31) */
548 + tst S_1, #3
549 + bne 140f /* S_1 not word-aligned: use the single-word load variant */
550 + memcmp_long_inner_loop 0 /* every path in the macro ends in a return or a branch to 200 */
551 +140: memcmp_long_inner_loop 1
552 +
553 +170: /* Short case */
554 + teq N, #0
555 + beq 199f /* zero length: return 0 through the next 199 label */
556 + preload_all 0, 0, 0, S_1, N, DAT0, DAT1
557 + preload_all 0, 0, 0, S_2, N, DAT0, DAT1
558 + tst S_2, #3
559 + beq 174f
560 +172: subs N, N, #1 /* compare byte by byte until S_2 is word-aligned or N runs out */
561 + blo 199f
562 + ldrb DAT0, [S_1], #1
563 + ldrb DAT4, [S_2], #1
564 + cmp DAT0, DAT4
565 + bne 200f
566 + tst S_2, #3
567 + bne 172b
568 +174: /* Second source now 4-byte aligned; we have 0 or more bytes to go */
569 + tst S_1, #3
570 + bne 140f /* S_1 not word-aligned: use the single-word load variant */
571 + memcmp_short_inner_loop 0
572 +140: memcmp_short_inner_loop 1
573 +
574 +200: /* Difference found: determine sign. */
575 + movhi a1, #1 /* flags are from the unsigned CMP that found the mismatch */
576 + movlo a1, #-1
577 + setend le /* undo the SETEND BE done at entry */
578 + pop {DAT1-DAT6, pc}
579 +
580 + .unreq S_1
581 + .unreq S_2
582 + .unreq N
583 + .unreq DAT0
584 + .unreq DAT1
585 + .unreq DAT2
586 + .unreq DAT3
587 + .unreq DAT4
588 + .unreq DAT5
589 + .unreq DAT6
590 + .unreq DAT7
591 + .unreq OFF
592 +ENDPROC(memcmp)
593 --- /dev/null
594 +++ b/arch/arm/lib/memcpy_rpi.S
595 @@ -0,0 +1,59 @@
596 +/*
597 +Copyright (c) 2013, Raspberry Pi Foundation
598 +Copyright (c) 2013, RISC OS Open Ltd
599 +All rights reserved.
600 +
601 +Redistribution and use in source and binary forms, with or without
602 +modification, are permitted provided that the following conditions are met:
603 + * Redistributions of source code must retain the above copyright
604 + notice, this list of conditions and the following disclaimer.
605 + * Redistributions in binary form must reproduce the above copyright
606 + notice, this list of conditions and the following disclaimer in the
607 + documentation and/or other materials provided with the distribution.
608 + * Neither the name of the copyright holder nor the
609 + names of its contributors may be used to endorse or promote products
610 + derived from this software without specific prior written permission.
611 +
612 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
613 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
614 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
615 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
616 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
617 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
618 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
619 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
620 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
621 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
622 +*/
623 +
624 +#include <linux/linkage.h>
625 +#include "arm-mem.h"
626 +#include "memcpymove.h"
627 +
628 +/* Prevent the stack from becoming executable */
629 +#if defined(__linux__) && defined(__ELF__)
630 +.section .note.GNU-stack,"",%progbits
631 +#endif
632 +
633 + .text
634 + .arch armv6 /* assemble for the ARMv6 instruction set */
635 + .object_arch armv4 /* but record ARMv4 as the architecture in the object file */
636 + .arm /* ARM (not Thumb) encoding */
637 + .altmacro /* enable the alternate macro syntax */
638 + .p2align 2 /* align to a 4-byte boundary */
639 +
640 +/*
641 + * void *memcpy(void * restrict s1, const void * restrict s2, size_t n);
642 + * On entry:
643 + * a1 = pointer to destination
644 + * a2 = pointer to source
645 + * a3 = number of bytes to copy
646 + * On exit:
647 + * a1 preserved
648 + */
649 +
650 +.set prefetch_distance, 3 /* read by the preload_* macros and the copy loops in memcpymove.h */
651 +
652 +ENTRY(memcpy)
653 + memcpy 0 /* expand the shared memcpy macro from memcpymove.h with backwards=0 */
654 +ENDPROC(memcpy)
655 --- /dev/null
656 +++ b/arch/arm/lib/memcpymove.h
657 @@ -0,0 +1,506 @@
658 +/*
659 +Copyright (c) 2013, Raspberry Pi Foundation
660 +Copyright (c) 2013, RISC OS Open Ltd
661 +All rights reserved.
662 +
663 +Redistribution and use in source and binary forms, with or without
664 +modification, are permitted provided that the following conditions are met:
665 + * Redistributions of source code must retain the above copyright
666 + notice, this list of conditions and the following disclaimer.
667 + * Redistributions in binary form must reproduce the above copyright
668 + notice, this list of conditions and the following disclaimer in the
669 + documentation and/or other materials provided with the distribution.
670 + * Neither the name of the copyright holder nor the
671 + names of its contributors may be used to endorse or promote products
672 + derived from this software without specific prior written permission.
673 +
674 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
675 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
676 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
677 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
678 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
679 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
680 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
681 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
682 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
683 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
684 +*/
685 +
686 +.macro unaligned_words backwards, align, use_pld, words, r0, r1, r2, r3, r4, r5, r6, r7, r8 /* Store 1, 2, 4 or 8 words to D, each assembled from two neighbouring words loaded through S and shifted by align bytes; use_pld is only read by the 8-word form. */
687 + .if words == 1
688 + .if backwards
689 + mov r1, r0, lsl #32-align*8 /* r0 holds the word loaded on the previous step */
690 + ldr r0, [S, #-4]!
691 + orr r1, r1, r0, lsr #align*8
692 + str r1, [D, #-4]!
693 + .else
694 + mov r0, r1, lsr #align*8 /* r1 holds the word loaded on the previous step */
695 + ldr r1, [S, #4]!
696 + orr r0, r0, r1, lsl #32-align*8
697 + str r0, [D], #4
698 + .endif
699 + .elseif words == 2
700 + .if backwards
701 + ldr r1, [S, #-4]!
702 + mov r2, r0, lsl #32-align*8 /* r0 holds the word loaded on the previous step */
703 + ldr r0, [S, #-4]!
704 + orr r2, r2, r1, lsr #align*8
705 + mov r1, r1, lsl #32-align*8
706 + orr r1, r1, r0, lsr #align*8
707 + stmdb D!, {r1, r2}
708 + .else
709 + ldr r1, [S, #4]!
710 + mov r0, r2, lsr #align*8 /* r2 holds the word loaded on the previous step */
711 + ldr r2, [S, #4]!
712 + orr r0, r0, r1, lsl #32-align*8
713 + mov r1, r1, lsr #align*8
714 + orr r1, r1, r2, lsl #32-align*8
715 + stmia D!, {r0, r1}
716 + .endif
717 + .elseif words == 4
718 + .if backwards
719 + ldmdb S!, {r2, r3}
720 + mov r4, r0, lsl #32-align*8 /* r0 holds the word loaded on the previous step */
721 + ldmdb S!, {r0, r1}
722 + orr r4, r4, r3, lsr #align*8
723 + mov r3, r3, lsl #32-align*8
724 + orr r3, r3, r2, lsr #align*8
725 + mov r2, r2, lsl #32-align*8
726 + orr r2, r2, r1, lsr #align*8
727 + mov r1, r1, lsl #32-align*8
728 + orr r1, r1, r0, lsr #align*8
729 + stmdb D!, {r1, r2, r3, r4}
730 + .else
731 + ldmib S!, {r1, r2}
732 + mov r0, r4, lsr #align*8 /* r4 holds the word loaded on the previous step */
733 + ldmib S!, {r3, r4}
734 + orr r0, r0, r1, lsl #32-align*8
735 + mov r1, r1, lsr #align*8
736 + orr r1, r1, r2, lsl #32-align*8
737 + mov r2, r2, lsr #align*8
738 + orr r2, r2, r3, lsl #32-align*8
739 + mov r3, r3, lsr #align*8
740 + orr r3, r3, r4, lsl #32-align*8
741 + stmia D!, {r0, r1, r2, r3}
742 + .endif
743 + .elseif words == 8
744 + .if backwards
745 + ldmdb S!, {r4, r5, r6, r7}
746 + mov r8, r0, lsl #32-align*8 /* r0 holds the word loaded on the previous step */
747 + ldmdb S!, {r0, r1, r2, r3}
748 + .if use_pld
749 + pld [S, OFF]
750 + .endif
751 + orr r8, r8, r7, lsr #align*8
752 + mov r7, r7, lsl #32-align*8
753 + orr r7, r7, r6, lsr #align*8
754 + mov r6, r6, lsl #32-align*8
755 + orr r6, r6, r5, lsr #align*8
756 + mov r5, r5, lsl #32-align*8
757 + orr r5, r5, r4, lsr #align*8
758 + mov r4, r4, lsl #32-align*8
759 + orr r4, r4, r3, lsr #align*8
760 + mov r3, r3, lsl #32-align*8
761 + orr r3, r3, r2, lsr #align*8
762 + mov r2, r2, lsl #32-align*8
763 + orr r2, r2, r1, lsr #align*8
764 + mov r1, r1, lsl #32-align*8
765 + orr r1, r1, r0, lsr #align*8
766 + stmdb D!, {r5, r6, r7, r8} /* descending copy: the upper four words are stored first */
767 + stmdb D!, {r1, r2, r3, r4}
768 + .else
769 + ldmib S!, {r1, r2, r3, r4}
770 + mov r0, r8, lsr #align*8 /* r8 holds the word loaded on the previous step */
771 + ldmib S!, {r5, r6, r7, r8}
772 + .if use_pld
773 + pld [S, OFF]
774 + .endif
775 + orr r0, r0, r1, lsl #32-align*8
776 + mov r1, r1, lsr #align*8
777 + orr r1, r1, r2, lsl #32-align*8
778 + mov r2, r2, lsr #align*8
779 + orr r2, r2, r3, lsl #32-align*8
780 + mov r3, r3, lsr #align*8
781 + orr r3, r3, r4, lsl #32-align*8
782 + mov r4, r4, lsr #align*8
783 + orr r4, r4, r5, lsl #32-align*8
784 + mov r5, r5, lsr #align*8
785 + orr r5, r5, r6, lsl #32-align*8
786 + mov r6, r6, lsr #align*8
787 + orr r6, r6, r7, lsl #32-align*8
788 + mov r7, r7, lsr #align*8
789 + orr r7, r7, r8, lsl #32-align*8
790 + stmia D!, {r0, r1, r2, r3}
791 + stmia D!, {r4, r5, r6, r7}
792 + .endif
793 + .endif
794 +.endm
795 +
796 +.macro memcpy_leading_15bytes backwards, align /* Copy the leading bytes counted in DAT2 (pieces of 1, 2, 4 and 8 selected by its low four bits) and subtract DAT2 from N. */
797 + movs DAT1, DAT2, lsl #31 /* N flag = DAT2 bit 0 (1 byte), C flag = DAT2 bit 1 (2 bytes) */
798 + sub N, N, DAT2 /* no S suffix, so the flags above survive */
799 + .if backwards
800 + ldrmib DAT0, [S, #-1]!
801 + ldrcsh DAT1, [S, #-2]!
802 + strmib DAT0, [D, #-1]!
803 + strcsh DAT1, [D, #-2]!
804 + .else
805 + ldrmib DAT0, [S], #1
806 + ldrcsh DAT1, [S], #2
807 + strmib DAT0, [D], #1
808 + strcsh DAT1, [D], #2
809 + .endif
810 + movs DAT1, DAT2, lsl #29 /* N flag = DAT2 bit 2 (4 bytes), C flag = DAT2 bit 3 (8 bytes) */
811 + .if backwards
812 + ldrmi DAT0, [S, #-4]!
813 + .if align == 0 /* NOTE(review): align is presumably the source misalignment in bytes, set by the caller - confirm in the memcpy macro */
814 + ldmcsdb S!, {DAT1, DAT2}
815 + .else
816 + ldrcs DAT2, [S, #-4]! /* single-word loads replace the LDM when align is non-zero */
817 + ldrcs DAT1, [S, #-4]!
818 + .endif
819 + strmi DAT0, [D, #-4]!
820 + stmcsdb D!, {DAT1, DAT2}
821 + .else
822 + ldrmi DAT0, [S], #4
823 + .if align == 0
824 + ldmcsia S!, {DAT1, DAT2}
825 + .else
826 + ldrcs DAT1, [S], #4 /* single-word loads replace the LDM when align is non-zero */
827 + ldrcs DAT2, [S], #4
828 + .endif
829 + strmi DAT0, [D], #4
830 + stmcsia D!, {DAT1, DAT2}
831 + .endif
832 +.endm
833 +
834 +.macro memcpy_trailing_15bytes backwards, align /* Copy the final (N & 15) bytes in pieces of 8, 4, 2 and 1 bytes; N is destroyed. */
835 + movs N, N, lsl #29 /* C flag = N bit 3 (8 bytes), N flag = N bit 2 (4 bytes) */
836 + .if backwards
837 + .if align == 0
838 + ldmcsdb S!, {DAT0, DAT1}
839 + .else
840 + ldrcs DAT1, [S, #-4]! /* single-word loads replace the LDM when align is non-zero */
841 + ldrcs DAT0, [S, #-4]!
842 + .endif
843 + ldrmi DAT2, [S, #-4]!
844 + stmcsdb D!, {DAT0, DAT1}
845 + strmi DAT2, [D, #-4]!
846 + .else
847 + .if align == 0
848 + ldmcsia S!, {DAT0, DAT1}
849 + .else
850 + ldrcs DAT0, [S], #4 /* single-word loads replace the LDM when align is non-zero */
851 + ldrcs DAT1, [S], #4
852 + .endif
853 + ldrmi DAT2, [S], #4
854 + stmcsia D!, {DAT0, DAT1}
855 + strmi DAT2, [D], #4
856 + .endif
857 + movs N, N, lsl #2 /* C flag = original N bit 1 (2 bytes), N flag = original N bit 0 (1 byte) */
858 + .if backwards
859 + ldrcsh DAT0, [S, #-2]!
860 + ldrmib DAT1, [S, #-1] /* last access: no pointer write-back */
861 + strcsh DAT0, [D, #-2]!
862 + strmib DAT1, [D, #-1]
863 + .else
864 + ldrcsh DAT0, [S], #2
865 + ldrmib DAT1, [S] /* last access: no pointer write-back */
866 + strcsh DAT0, [D], #2
867 + strmib DAT1, [D]
868 + .endif
869 +.endm
870 +
871 +.macro memcpy_long_inner_loop backwards, align /* Long-copy body: 32 bytes per pass with prefetch, then the remaining 32-byte blocks, an optional 16-byte block and the tail; ends by returning. */
872 + .if align != 0
873 + .if backwards
874 + ldr DAT0, [S, #-align]! /* step S back by align bytes and load the first word for unaligned_words to merge */
875 + .else
876 + ldr LAST, [S, #-align]! /* step S back by align bytes and load the first word for unaligned_words to merge */
877 + .endif
878 + .endif
879 +110:
880 + .if align == 0
881 + .if backwards
882 + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
883 + pld [S, OFF]
884 + stmdb D!, {DAT4, DAT5, DAT6, LAST}
885 + stmdb D!, {DAT0, DAT1, DAT2, DAT3}
886 + .else
887 + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
888 + pld [S, OFF]
889 + stmia D!, {DAT0, DAT1, DAT2, DAT3}
890 + stmia D!, {DAT4, DAT5, DAT6, LAST}
891 + .endif
892 + .else
893 + unaligned_words backwards, align, 1, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
894 + .endif
895 + subs N, N, #32
896 + bhs 110b
897 + /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
898 + preload_trailing backwards, S, N, OFF
899 + add N, N, #(prefetch_distance+2)*32 - 32 /* NOTE(review): presumably removes a (prefetch_distance+2)*32 bias applied by the caller, keeping N 32 low for the loop below - confirm in the memcpy macro */
900 +120:
901 + .if align == 0
902 + .if backwards
903 + ldmdb S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
904 + stmdb D!, {DAT4, DAT5, DAT6, LAST}
905 + stmdb D!, {DAT0, DAT1, DAT2, DAT3}
906 + .else
907 + ldmia S!, {DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, LAST}
908 + stmia D!, {DAT0, DAT1, DAT2, DAT3}
909 + stmia D!, {DAT4, DAT5, DAT6, LAST}
910 + .endif
911 + .else
912 + unaligned_words backwards, align, 0, 8, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7, LAST
913 + .endif
914 + subs N, N, #32
915 + bhs 120b
916 + tst N, #16 /* subtracting 32 left bit 4 set when one more 16-byte block remains */
917 + .if align == 0
918 + .if backwards
919 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
920 + stmnedb D!, {DAT0, DAT1, DAT2, LAST}
921 + .else
922 + ldmneia S!, {DAT0, DAT1, DAT2, LAST}
923 + stmneia D!, {DAT0, DAT1, DAT2, LAST}
924 + .endif
925 + .else
926 + beq 130f
927 + unaligned_words backwards, align, 0, 4, DAT0, DAT1, DAT2, DAT3, LAST
928 +130:
929 + .endif
930 + /* Trailing words and bytes */
931 + tst N, #15
932 + beq 199f
933 + .if align != 0
934 + add S, S, #align /* undo the -align adjustment made at the top of the macro */
935 + .endif
936 + memcpy_trailing_15bytes backwards, align
937 +199:
938 + pop {DAT3, DAT4, DAT5, DAT6, DAT7} /* NOTE(review): the matching push is not in this macro; presumably done by the caller on the long path - confirm */
939 + pop {D, DAT1, DAT2, pc} /* mirrors the push at the start of the memcpy macro; restores D and returns */
940 +.endm
941 +
942 +.macro memcpy_medium_inner_loop backwards, align /* Medium-copy body: 16 bytes per pass without prefetch, then the tail; ends by returning. */
943 +120:
944 + .if backwards
945 + .if align == 0
946 + ldmdb S!, {DAT0, DAT1, DAT2, LAST}
947 + .else
948 + ldr LAST, [S, #-4]! /* single-word loads replace the LDM when align is non-zero */
949 + ldr DAT2, [S, #-4]!
950 + ldr DAT1, [S, #-4]!
951 + ldr DAT0, [S, #-4]!
952 + .endif
953 + stmdb D!, {DAT0, DAT1, DAT2, LAST}
954 + .else
955 + .if align == 0
956 + ldmia S!, {DAT0, DAT1, DAT2, LAST}
957 + .else
958 + ldr DAT0, [S], #4 /* single-word loads replace the LDM when align is non-zero */
959 + ldr DAT1, [S], #4
960 + ldr DAT2, [S], #4
961 + ldr LAST, [S], #4
962 + .endif
963 + stmia D!, {DAT0, DAT1, DAT2, LAST}
964 + .endif
965 + subs N, N, #16
966 + bhs 120b /* NOTE(review): the loop exit relies on N having been reduced by the caller beforehand - confirm in the memcpy macro */
967 + /* Trailing words and bytes */
968 + tst N, #15 /* subtracting 16 leaves the low four bits equal to the leftover byte count */
969 + beq 199f
970 + memcpy_trailing_15bytes backwards, align
971 +199:
972 + pop {D, DAT1, DAT2, pc} /* mirrors the push at the start of the memcpy macro; restores D and returns */
973 +.endm
974 +
975 +.macro memcpy_short_inner_loop backwards, align
976 + tst N, #16 /* short case: at most one 16-byte block, copied only if bit 4 of N is set (every load/store below is NE-conditional) */
977 + .if backwards
978 + .if align == 0
979 + ldmnedb S!, {DAT0, DAT1, DAT2, LAST}
980 + .else
981 + ldrne LAST, [S, #-4]! /* align != 0 (the memcpy macro picks this when S & 3 is non-zero): four single-word loads instead of one LDM */
982 + ldrne DAT2, [S, #-4]!
983 + ldrne DAT1, [S, #-4]!
984 + ldrne DAT0, [S, #-4]!
985 + .endif
986 + stmnedb D!, {DAT0, DAT1, DAT2, LAST}
987 + .else
988 + .if align == 0
989 + ldmneia S!, {DAT0, DAT1, DAT2, LAST}
990 + .else
991 + ldrne DAT0, [S], #4 /* same single-word fallback, ascending */
992 + ldrne DAT1, [S], #4
993 + ldrne DAT2, [S], #4
994 + ldrne LAST, [S], #4
995 + .endif
996 + stmneia D!, {DAT0, DAT1, DAT2, LAST}
997 + .endif
998 + memcpy_trailing_15bytes backwards, align /* invoked unconditionally here, unlike the medium loop which first tests N & 15 */
999 +199: /* also the target of the early-exit branches in the short case of the memcpy macro */
1000 + pop {D, DAT1, DAT2, pc} /* reload the registers pushed at the top of the memcpy macro; D gets its entry value back and loading pc returns */
1001 +.endm
1002 +
1003 +.macro memcpy backwards
1004 + D .req a1 /* destination pointer */
1005 + S .req a2 /* source pointer */
1006 + N .req a3 /* byte count */
1007 + DAT0 .req a4
1008 + DAT1 .req v1
1009 + DAT2 .req v2
1010 + DAT3 .req v3
1011 + DAT4 .req v4
1012 + DAT5 .req v5
1013 + DAT6 .req v6
1014 + DAT7 .req sl
1015 + LAST .req ip
1016 + OFF .req lr /* lr doubles as a scratch register; the return address lives on the stack from the push below */
1017 +
1018 + .cfi_startproc
1019 +
1020 + push {D, DAT1, DAT2, lr} /* D is saved before it is modified; the exit paths visible in this file pop it back, so a1 comes back unchanged */
1021 +
1022 + .cfi_def_cfa_offset 16
1023 + .cfi_rel_offset D, 0
1024 + .cfi_undefined S
1025 + .cfi_undefined N
1026 + .cfi_undefined DAT0
1027 + .cfi_rel_offset DAT1, 4
1028 + .cfi_rel_offset DAT2, 8
1029 + .cfi_undefined LAST
1030 + .cfi_rel_offset lr, 12
1031 +
1032 + .if backwards
1033 + add D, D, N /* descending copy: start both pointers one past the end of their buffers */
1034 + add S, S, N
1035 + .endif
1036 +
1037 + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
1038 + cmp N, #31
1039 + blo 170f /* N < 31: short case */
1040 + /* To preload ahead as we go, we need at least (prefetch_distance+2) 32-byte blocks */
1041 + cmp N, #(prefetch_distance+3)*32 - 1
1042 + blo 160f /* below that threshold: medium case */
1043 +
1044 + /* Long case */
1045 + push {DAT3, DAT4, DAT5, DAT6, DAT7} /* five more registers, used only by the long inner loop, which pops them again before returning */
1046 +
1047 + .cfi_def_cfa_offset 36
1048 + .cfi_rel_offset D, 20
1049 + .cfi_rel_offset DAT1, 24
1050 + .cfi_rel_offset DAT2, 28
1051 + .cfi_rel_offset DAT3, 0
1052 + .cfi_rel_offset DAT4, 4
1053 + .cfi_rel_offset DAT5, 8
1054 + .cfi_rel_offset DAT6, 12
1055 + .cfi_rel_offset DAT7, 16
1056 + .cfi_rel_offset lr, 32
1057 +
1058 + /* Adjust N so that the decrement instruction can also test for
1059 + * inner loop termination. We want it to stop when there are
1060 + * (prefetch_distance+1) complete blocks to go. */
1061 + sub N, N, #(prefetch_distance+2)*32
1062 + preload_leading_step1 backwards, DAT0, S
1063 + .if backwards
1064 + /* Bug in GAS: it accepts, but mis-assembles the instruction
1065 + * ands DAT2, D, #60, 2
1066 + * which sets DAT2 to the number of leading bytes until destination is aligned and also clears C (sets borrow)
1067 + */
1068 + .word 0xE210513C /* hand-encoded form of the ANDS above: immediate 0x3C rotated right by 2, i.e. D & 15 */
1069 + beq 154f
1070 + .else
1071 + ands DAT2, D, #15
1072 + beq 154f
1073 + rsb DAT2, DAT2, #16 /* number of leading bytes until destination aligned */
1074 + .endif
1075 + preload_leading_step2 backwards, DAT0, S, DAT2, OFF
1076 + memcpy_leading_15bytes backwards, 1
1077 +154: /* Destination now 16-byte aligned; we have at least one prefetch as well as at least one 16-byte output block */
1078 + /* Prefetch offset is best selected such that it lies in the first 8 of each 32 bytes - but it's just as easy to aim for the first one */
1079 + .if backwards
1080 + rsb OFF, S, #3
1081 + and OFF, OFF, #28
1082 + sub OFF, OFF, #32*(prefetch_distance+1)
1083 + .else
1084 + and OFF, S, #28
1085 + rsb OFF, OFF, #32*prefetch_distance
1086 + .endif
1087 + movs DAT0, S, lsl #31 /* dispatch on S & 3: the shift leaves N = bit 0 and C = bit 1 of S, Z set only when both are clear */
1088 + bhi 157f /* C set, Z clear: S & 3 == 3 */
1089 + bcs 156f /* C set: S & 3 == 2 */
1090 + bmi 155f /* N set: S & 3 == 1 */
1091 + memcpy_long_inner_loop backwards, 0 /* each expansion ends by popping pc, so the variants never fall through into one another */
1092 +155: memcpy_long_inner_loop backwards, 1
1093 +156: memcpy_long_inner_loop backwards, 2
1094 +157: memcpy_long_inner_loop backwards, 3
1095 +
1096 + .cfi_def_cfa_offset 16
1097 + .cfi_rel_offset D, 0
1098 + .cfi_rel_offset DAT1, 4
1099 + .cfi_rel_offset DAT2, 8
1100 + .cfi_same_value DAT3
1101 + .cfi_same_value DAT4
1102 + .cfi_same_value DAT5
1103 + .cfi_same_value DAT6
1104 + .cfi_same_value DAT7
1105 + .cfi_rel_offset lr, 12
1106 +
1107 +160: /* Medium case */
1108 + preload_all backwards, 0, 0, S, N, DAT2, OFF
1109 + sub N, N, #16 /* simplifies inner loop termination */
1110 + .if backwards
1111 + ands DAT2, D, #15 /* descending: D & 15 is itself the number of bytes down to the 16-byte boundary */
1112 + beq 164f
1113 + .else
1114 + ands DAT2, D, #15
1115 + beq 164f
1116 + rsb DAT2, DAT2, #16 /* ascending: 16 - (D & 15) bytes up to the 16-byte boundary */
1117 + .endif
1118 + memcpy_leading_15bytes backwards, align /* NOTE(review): this macro has no `align` parameter, so the bare token `align` is what gets passed (the long case passes 1) - confirm this is intended */
1119 +164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
1120 + tst S, #3
1121 + bne 140f /* S not word-aligned: take the single-word-load variant */
1122 + memcpy_medium_inner_loop backwards, 0
1123 +140: memcpy_medium_inner_loop backwards, 1
1124 +
1125 +170: /* Short case, less than 31 bytes, so no guarantee of at least one 16-byte block */
1126 + teq N, #0
1127 + beq 199f /* nothing to copy: jump to the 199 label inside the first memcpy_short_inner_loop expansion below, which pops and returns */
1128 + preload_all backwards, 1, 0, S, N, DAT2, LAST
1129 + tst D, #3
1130 + beq 174f
1131 +172: subs N, N, #1 /* copy single bytes until D is 4-byte aligned, leaving early if N runs out first */
1132 + blo 199f
1133 + .if backwards
1134 + ldrb DAT0, [S, #-1]!
1135 + strb DAT0, [D, #-1]!
1136 + .else
1137 + ldrb DAT0, [S], #1
1138 + strb DAT0, [D], #1
1139 + .endif
1140 + tst D, #3
1141 + bne 172b
1142 +174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
1143 + tst S, #3
1144 + bne 140f /* S not word-aligned: take the single-word-load variant */
1145 + memcpy_short_inner_loop backwards, 0
1146 +140: memcpy_short_inner_loop backwards, 1
1147 +
1148 + .cfi_endproc
1149 +
1150 + .unreq D
1151 + .unreq S
1152 + .unreq N
1153 + .unreq DAT0
1154 + .unreq DAT1
1155 + .unreq DAT2
1156 + .unreq DAT3
1157 + .unreq DAT4
1158 + .unreq DAT5
1159 + .unreq DAT6
1160 + .unreq DAT7
1161 + .unreq LAST
1162 + .unreq OFF
1163 +.endm
1164 --- /dev/null
1165 +++ b/arch/arm/lib/memmove_rpi.S
1166 @@ -0,0 +1,61 @@
1167 +/*
1168 +Copyright (c) 2013, Raspberry Pi Foundation
1169 +Copyright (c) 2013, RISC OS Open Ltd
1170 +All rights reserved.
1171 +
1172 +Redistribution and use in source and binary forms, with or without
1173 +modification, are permitted provided that the following conditions are met:
1174 + * Redistributions of source code must retain the above copyright
1175 + notice, this list of conditions and the following disclaimer.
1176 + * Redistributions in binary form must reproduce the above copyright
1177 + notice, this list of conditions and the following disclaimer in the
1178 + documentation and/or other materials provided with the distribution.
1179 + * Neither the name of the copyright holder nor the
1180 + names of its contributors may be used to endorse or promote products
1181 + derived from this software without specific prior written permission.
1182 +
1183 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
1184 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1185 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1186 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
1187 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1188 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
1189 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
1190 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1191 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1192 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1193 +*/
1194 +
1195 +#include <linux/linkage.h>
1196 +#include "arm-mem.h"
1197 +#include "memcpymove.h"
1198 +
1199 +/* Prevent the stack from becoming executable */
1200 +#if defined(__linux__) && defined(__ELF__)
1201 +.section .note.GNU-stack,"",%progbits
1202 +#endif
1203 +
1204 + .text
1205 + .arch armv6
1206 + .object_arch armv4
1207 + .arm
1208 + .altmacro
1209 + .p2align 2
1210 +
1211 +/*
1212 + * void *memmove(void *s1, const void *s2, size_t n);
1213 + * On entry:
1214 + * a1 = pointer to destination
1215 + * a2 = pointer to source
1216 + * a3 = number of bytes to copy
1217 + * On exit:
1218 + * a1 preserved
1219 + */
1220 +
1221 +.set prefetch_distance, 3
1222 +
1223 +ENTRY(memmove)
1224 + cmp a2, a1 /* flags from source - destination; only the sign bit of the 32-bit difference is used below */
1225 + bpl memcpy /* pl works even over -1 - 0 and 0x7fffffff - 0x80000000 boundaries */
1226 + memcpy 1 /* otherwise expand the copy macro with backwards = 1, i.e. a descending copy starting from the end of both buffers */
1227 +ENDPROC(memmove)
1228 --- /dev/null
1229 +++ b/arch/arm/lib/memset_rpi.S
1230 @@ -0,0 +1,121 @@
1231 +/*
1232 +Copyright (c) 2013, Raspberry Pi Foundation
1233 +Copyright (c) 2013, RISC OS Open Ltd
1234 +All rights reserved.
1235 +
1236 +Redistribution and use in source and binary forms, with or without
1237 +modification, are permitted provided that the following conditions are met:
1238 + * Redistributions of source code must retain the above copyright
1239 + notice, this list of conditions and the following disclaimer.
1240 + * Redistributions in binary form must reproduce the above copyright
1241 + notice, this list of conditions and the following disclaimer in the
1242 + documentation and/or other materials provided with the distribution.
1243 + * Neither the name of the copyright holder nor the
1244 + names of its contributors may be used to endorse or promote products
1245 + derived from this software without specific prior written permission.
1246 +
1247 +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
1248 +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
1249 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1250 +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
1251 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
1252 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
1253 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
1254 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
1255 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1256 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1257 +*/
1258 +
1259 +#include <linux/linkage.h>
1260 +#include "arm-mem.h"
1261 +
1262 +/* Prevent the stack from becoming executable */
1263 +#if defined(__linux__) && defined(__ELF__)
1264 +.section .note.GNU-stack,"",%progbits
1265 +#endif
1266 +
1267 + .text
1268 + .arch armv6
1269 + .object_arch armv4
1270 + .arm
1271 + .altmacro
1272 + .p2align 2
1273 +
1274 +/*
1275 + * void *memset(void *s, int c, size_t n);
1276 + * On entry:
1277 + * a1 = pointer to buffer to fill
1278 + * a2 = byte pattern to fill with (caller-narrowed)
1279 + * a3 = number of bytes to fill
1280 + * On exit:
1281 + * a1 preserved
1282 + */
1283 +ENTRY(memset)
1284 + S .req a1
1285 + DAT0 .req a2
1286 + N .req a3
1287 + DAT1 .req a4
1288 + DAT2 .req ip
1289 + DAT3 .req lr
1290 +
1291 + orr DAT0, DAT0, lsl #8 /* replicate the fill byte into bits 8-15. NOTE(review): a2 is never masked to 8 bits, so any bits 8-31 the caller leaves set end up in the pattern */
1292 + push {S, lr} /* keep the original pointer for the return value; lr must be saved because DAT3 aliases it */
1293 + orr DAT0, DAT0, lsl #16 /* replicate the low halfword into the high halfword: DAT0 is now the full 32-bit pattern */
1294 + mov DAT1, DAT0
1295 +
1296 + /* See if we're guaranteed to have at least one 16-byte aligned 16-byte write */
1297 + cmp N, #31
1298 + blo 170f
1299 +
1300 +161: sub N, N, #16 /* simplifies inner loop termination */
1301 + /* Leading words and bytes */
1302 + tst S, #15
1303 + beq 164f
1304 + rsb DAT3, S, #0 /* bits 0-3 = number of leading bytes until aligned */
1305 + movs DAT2, DAT3, lsl #31 /* N flag = bit 0 of that count (store 1 byte), C = bit 1 (store 2 bytes) */
1306 + submi N, N, #1
1307 + strmib DAT0, [S], #1
1308 + subcs N, N, #2
1309 + strcsh DAT0, [S], #2
1310 + movs DAT2, DAT3, lsl #29 /* N flag = bit 2 of the count (store 4 bytes), C = bit 3 (store 8 bytes) */
1311 + submi N, N, #4
1312 + strmi DAT0, [S], #4
1313 + subcs N, N, #8
1314 + stmcsia S!, {DAT0, DAT1}
1315 +164: /* Delayed set up of DAT2 and DAT3 so we could use them as scratch registers above */
1316 + mov DAT2, DAT0
1317 + mov DAT3, DAT0
1318 + /* Now the inner loop of 16-byte stores */
1319 +165: stmia S!, {DAT0, DAT1, DAT2, DAT3}
1320 + subs N, N, #16 /* N was reduced by 16 at label 161, so a borrow here means no full block remains */
1321 + bhs 165b
1322 +166: /* Trailing words and bytes */
1323 + movs N, N, lsl #29 /* C = bit 3 of the remaining count (store 8 bytes), N flag = bit 2 (store 4 bytes) */
1324 + stmcsia S!, {DAT0, DAT1}
1325 + strmi DAT0, [S], #4
1326 + movs N, N, lsl #2 /* shifted by 31 in total: C = original bit 1 (store 2 bytes), N flag = original bit 0 (store 1 byte) */
1327 + strcsh DAT0, [S], #2
1328 + strmib DAT0, [S]
1329 +199: pop {S, pc} /* a1 gets the original buffer pointer back and loading pc returns */
1330 +
1331 +170: /* Short case */
1332 + mov DAT2, DAT0
1333 + mov DAT3, DAT0
1334 + tst S, #3
1335 + beq 174f
1336 +172: subs N, N, #1 /* store single bytes until S is 4-byte aligned, leaving through 199 if N runs out first */
1337 + blo 199b
1338 + strb DAT0, [S], #1
1339 + tst S, #3
1340 + bne 172b
1341 +174: tst N, #16 /* fewer than 31 bytes remain: at most one 16-byte store, done only if bit 4 of N is set */
1342 + stmneia S!, {DAT0, DAT1, DAT2, DAT3}
1343 + b 166b /* share the trailing 0-15 byte code with the main path */
1344 +
1345 + .unreq S
1346 + .unreq DAT0
1347 + .unreq N
1348 + .unreq DAT1
1349 + .unreq DAT2
1350 + .unreq DAT3
1351 +ENDPROC(memset)
1352 --- a/arch/arm/lib/uaccess_with_memcpy.c
1353 +++ b/arch/arm/lib/uaccess_with_memcpy.c
1354 @@ -22,6 +22,14 @@
1355 #include <asm/current.h>
1356 #include <asm/page.h>
1357
1358 +#ifndef COPY_FROM_USER_THRESHOLD
1359 +#define COPY_FROM_USER_THRESHOLD 64
1360 +#endif
1361 +
1362 +#ifndef COPY_TO_USER_THRESHOLD
1363 +#define COPY_TO_USER_THRESHOLD 64
1364 +#endif
1365 +
1366 static int
1367 pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
1368 {
1369 @@ -85,7 +93,44 @@ pin_page_for_write(const void __user *_a
1370 return 1;
1371 }
1372
1373 -static unsigned long noinline
1374 +static int
1375 +pin_page_for_read(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
1376 +{ /* Walk current->mm's page tables for _addr; return 1 with the PTE and its lock handed back through ptep/ptlp, or 0 with nothing held */
1377 + unsigned long addr = (unsigned long)_addr;
1378 + pgd_t *pgd;
1379 + pmd_t *pmd;
1380 + pte_t *pte;
1381 + pud_t *pud;
1382 + spinlock_t *ptl;
1383 +
1384 + pgd = pgd_offset(current->mm, addr);
1385 + if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
1386 + {
1387 + return 0; /* empty or bad entry at any level: report failure and let the caller decide what to do */
1388 + }
1389 + pud = pud_offset(pgd, addr);
1390 + if (unlikely(pud_none(*pud) || pud_bad(*pud)))
1391 + {
1392 + return 0;
1393 + }
1394 +
1395 + pmd = pmd_offset(pud, addr);
1396 + if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
1397 + return 0;
1398 +
1399 + pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl); /* from here on the failure path must undo this with pte_unmap_unlock() */
1400 + if (unlikely(!pte_present(*pte) || !pte_young(*pte))) { /* a present but not-young PTE is also refused; the caller then probes the address with __get_user() and retries */
1401 + pte_unmap_unlock(pte, ptl);
1402 + return 0;
1403 + }
1404 +
1405 + *ptep = pte; /* success: the caller now owns the mapping and lock and must call pte_unmap_unlock(pte, ptl) */
1406 + *ptlp = ptl;
1407 +
1408 + return 1;
1409 +}
1410 +
1411 +unsigned long noinline
1412 __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
1413 {
1414 int atomic;
1415 @@ -135,6 +180,54 @@ out:
1416 return n;
1417 }
1418
1419 +unsigned long noinline
1420 +__copy_from_user_memcpy(void *to, const void __user *from, unsigned long n)
1421 +{ /* Copy n bytes from user pointer `from` to `to` with memcpy(), one page-bounded chunk at a time; returns the number of bytes NOT copied (0 on full success) */
1422 + int atomic;
1423 +
1424 + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { /* address limit is KERNEL_DS: `from` is used directly, with no page-table walk */
1425 + memcpy(to, (const void *)from, n);
1426 + return 0;
1427 + }
1428 +
1429 + /* the mmap semaphore is taken only if not in an atomic context */
1430 + atomic = in_atomic();
1431 +
1432 + if (!atomic)
1433 + down_read(&current->mm->mmap_sem);
1434 + while (n) {
1435 + pte_t *pte;
1436 + spinlock_t *ptl;
1437 + int tocopy;
1438 +
1439 + while (!pin_page_for_read(from, &pte, &ptl)) { /* retry until the page holding `from` can be pinned */
1440 + char temp; /* value is never used; only the success/failure of the read matters */
1441 + if (!atomic)
1442 + up_read(&current->mm->mmap_sem); /* dropped around the probe below (presumably because it may fault - confirm) */
1443 + if (__get_user(temp, (char __user *)from))
1444 + goto out; /* mmap_sem is already released here, which is why `out` sits after the final up_read() */
1445 + if (!atomic)
1446 + down_read(&current->mm->mmap_sem);
1447 + }
1448 +
1449 + tocopy = (~(unsigned long)from & ~PAGE_MASK) + 1; /* bytes from `from` to the end of its page, assuming PAGE_MASK clears the in-page offset bits - confirm */
1450 + if (tocopy > n)
1451 + tocopy = n;
1452 +
1453 + memcpy(to, (const void *)from, tocopy); /* runs with the PTE lock from pin_page_for_read() still held */
1454 + to += tocopy;
1455 + from += tocopy;
1456 + n -= tocopy;
1457 +
1458 + pte_unmap_unlock(pte, ptl);
1459 + }
1460 + if (!atomic)
1461 + up_read(&current->mm->mmap_sem);
1462 +
1463 +out:
1464 + return n; /* 0 after a complete copy, otherwise the count still outstanding when __get_user() failed */
1465 +}
1466 +
1467 unsigned long
1468 __copy_to_user(void __user *to, const void *from, unsigned long n)
1469 {
1470 @@ -145,10 +238,25 @@ __copy_to_user(void __user *to, const vo
1471 * With frame pointer disabled, tail call optimization kicks in
1472 * as well making this test almost invisible.
1473 */
1474 - if (n < 64)
1475 + if (n < COPY_TO_USER_THRESHOLD)
1476 return __copy_to_user_std(to, from, n);
1477 return __copy_to_user_memcpy(to, from, n);
1478 }
1479 +
1480 +unsigned long
1481 +__copy_from_user(void *to, const void __user *from, unsigned long n)
1482 +{ /* Size-based dispatcher: small copies go to __copy_from_user_std(), everything else to __copy_from_user_memcpy() */
1483 + /*
1484 + * This test is stubbed out of the main function above to keep
1485 + * the overhead for small copies low by avoiding a large
1486 + * register dump on the stack just to reload them right away.
1487 + * With frame pointer disabled, tail call optimization kicks in
1488 + * as well making this test almost invisible.
1489 + */
1490 + if (n < COPY_FROM_USER_THRESHOLD) /* 64 unless COPY_FROM_USER_THRESHOLD was already defined before this file's #ifndef default */
1491 + return __copy_from_user_std(to, from, n);
1492 + return __copy_from_user_memcpy(to, from, n);
1493 +}
1494
1495 static unsigned long noinline
1496 __clear_user_memset(void __user *addr, unsigned long n)