TOMOYO Linux Cross Reference
Linux/arch/arm64/lib/memcpy.S


/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2012-2021, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#define L(label) .L ## label

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_lw    w10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     x16
#define F_h     x17
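/* G_l..H_h reuse registers (count, dst, src, srcend) whose values are no
   longer needed at the points where these temporaries are used.  */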
#define G_l     count
#define G_h     dst
#define H_l     src
#define H_h     srcend
#define tmp1    x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
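The dispatch described above can be modelled in C.  The sketch below is
illustrative only (sketch_memmove and its byte loops are stand-ins, not the
kernel's code): it shows the single unsigned overlap test and the choice of
copy direction.  The real routine also splits the forward case into the
0..32, 33..128 and large paths that follow, and skips the overlap test for
copies of at most 128 bytes because those paths load every source byte
before storing anything.

#include <stddef.h>
#include <stdint.h>

/* Simplified model of the entry point: pick the copy direction with one
 * unsigned comparison, then copy byte by byte.  The assembly replaces the
 * byte loops with the LDP/STP paths below. */
static void *sketch_memmove(void *dstin, const void *src, size_t count)
{
        unsigned char *d = dstin;
        const unsigned char *s = src;
        uintptr_t diff = (uintptr_t)d - (uintptr_t)s;

        if (diff == 0 || count == 0)            /* nothing to move */
                return dstin;

        if (diff >= count) {
                /* dst does not start inside [src, src + count): a forward
                 * copy never overwrites a source byte before reading it.
                 * This also covers dst below src, where diff wraps to a
                 * huge unsigned value. */
                for (size_t i = 0; i < count; i++)
                        d[i] = s[i];
        } else {
                /* A forward copy would clobber the source: go backwards. */
                for (size_t i = count; i-- > 0; )
                        d[i] = s[i];
        }
        return dstin;
}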

SYM_FUNC_START(__pi_memcpy)
        add     srcend, src, count
        add     dstend, dstin, count
        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        /* Small copies: 0..32 bytes.  */
        cmp     count, 16
        b.lo    L(copy16)
        ldp     A_l, A_h, [src]
        ldp     D_l, D_h, [srcend, -16]
        stp     A_l, A_h, [dstin]
        stp     D_l, D_h, [dstend, -16]
        ret

        /* Copy 8-15 bytes.  */
L(copy16):
        tbz     count, 3, L(copy8)
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret

        .p2align 3
        /* Copy 4-7 bytes.  */
L(copy8):
        tbz     count, 2, L(copy4)
        ldr     A_lw, [src]
        ldr     B_lw, [srcend, -4]
        str     A_lw, [dstin]
        str     B_lw, [dstend, -4]
        ret

        /* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
        cbz     count, L(copy0)
        lsr     tmp1, count, 1
        ldrb    A_lw, [src]
        ldrb    C_lw, [srcend, -1]
        ldrb    B_lw, [src, tmp1]
        strb    A_lw, [dstin]
        strb    B_lw, [dstin, tmp1]
        strb    C_lw, [dstend, -1]
L(copy0):
        ret

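The 0..3 byte case above avoids branching on the exact length by always
touching the first byte, the byte at count/2 and the last byte.  A rough C
equivalent (illustrative only, not the kernel's code):

#include <stddef.h>

/* Branchless 1..3 byte copy: the indices 0, count/2 and count-1 cover
 * every length in 1..3, and loading all three bytes before storing any
 * of them keeps the sequence safe for overlapping buffers.
 *   count == 1 -> indices 0, 0, 0
 *   count == 2 -> indices 0, 1, 1
 *   count == 3 -> indices 0, 1, 2  */
static void copy_upto3(unsigned char *dst, const unsigned char *src, size_t count)
{
        if (count == 0)                 /* the cbz count, L(copy0) above */
                return;
        unsigned char a = src[0];
        unsigned char b = src[count >> 1];
        unsigned char c = src[count - 1];
        dst[0] = a;
        dst[count >> 1] = b;
        dst[count - 1] = c;
}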
        .p2align 4
        /* Medium copies: 33..128 bytes.  */
L(copy32_128):
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, 16]
        ldp     C_l, C_h, [srcend, -32]
        ldp     D_l, D_h, [srcend, -16]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

        .p2align 4
        /* Copy 65..128 bytes.  */
L(copy128):
        ldp     E_l, E_h, [src, 32]
        ldp     F_l, F_h, [src, 48]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_l, G_h, [srcend, -64]
        ldp     H_l, H_h, [srcend, -48]
        stp     G_l, G_h, [dstend, -64]
        stp     H_l, H_h, [dstend, -48]
L(copy96):
        stp     A_l, A_h, [dstin]
        stp     B_l, B_h, [dstin, 16]
        stp     E_l, E_h, [dstin, 32]
        stp     F_l, F_h, [dstin, 48]
        stp     C_l, C_h, [dstend, -32]
        stp     D_l, D_h, [dstend, -16]
        ret

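The medium-size paths never loop: they load windows from both ends of the
buffer before storing anything, so the windows may overlap in the middle and
overlapping src/dst buffers are still handled.  A rough C model of the
33..64 byte case (illustrative only; the helper name and the bounce buffers
are inventions of this sketch, where the assembly keeps everything in
registers):

#include <string.h>

/* Copy 33..64 bytes as the first 32 plus the last 32 bytes.  For counts
 * below 64 the two windows overlap, but since both are read before either
 * is written every byte still receives its correct value, even when dst
 * and src themselves overlap. */
static void copy_33_to_64(unsigned char *dst, const unsigned char *src, size_t count)
{
        unsigned char head[32], tail[32];

        memcpy(head, src, 32);                  /* the A and B pairs */
        memcpy(tail, src + count - 32, 32);     /* the C and D pairs */
        memcpy(dst, head, 32);
        memcpy(dst + count - 32, tail, 32);
}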
        .p2align 4
        /* Copy more than 128 bytes.  */
L(copy_long):
        /* Use backwards copy if there is an overlap.  */
        sub     tmp1, dstin, src
        cbz     tmp1, L(copy0)
        cmp     tmp1, count
        b.lo    L(copy_long_backwards)

        /* Copy 16 bytes and then align dst to 16-byte alignment.  */

        ldp     D_l, D_h, [src]
        and     tmp1, dstin, 15
        bic     dst, dstin, 15
        sub     src, src, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_l, A_h, [src, 16]
        stp     D_l, D_h, [dstin]
        ldp     B_l, B_h, [src, 32]
        ldp     C_l, C_h, [src, 48]
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)

L(loop64):
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [src, 16]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [src, 32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [src, 48]
        stp     D_l, D_h, [dst, 64]!
        ldp     D_l, D_h, [src, 64]!
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
        ldp     E_l, E_h, [srcend, -64]
        stp     A_l, A_h, [dst, 16]
        ldp     A_l, A_h, [srcend, -48]
        stp     B_l, B_h, [dst, 32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dst, 48]
        ldp     C_l, C_h, [srcend, -16]
        stp     D_l, D_h, [dst, 64]
        stp     E_l, E_h, [dstend, -64]
        stp     A_l, A_h, [dstend, -48]
        stp     B_l, B_h, [dstend, -32]
        stp     C_l, C_h, [dstend, -16]
        ret

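A rough C model of the forward large-copy path above (illustrative only:
unlike the assembly it assumes dst and src do not overlap at all, and it
lets the alignment head be 1..16 bytes where the assembly always copies
exactly 16 and lets the first loop stores overlap that head):

#include <stdint.h>
#include <string.h>

/* Forward copy of more than 128 bytes: an unaligned head, a loop over
 * 16-byte-aligned destination blocks of 64 bytes, and a tail that simply
 * recopies the last 64 bytes from the end, possibly overlapping bytes the
 * loop already wrote. */
static void copy_long_forward(unsigned char *dstin, const unsigned char *src,
                              size_t count)             /* count > 128 */
{
        unsigned char *dstend = dstin + count;
        const unsigned char *srcend = src + count;
        size_t head = 16 - ((uintptr_t)dstin & 15);     /* 1..16 bytes */

        memcpy(dstin, src, 16);                 /* covers the head */
        unsigned char *dst = dstin + head;      /* 16-byte aligned */
        const unsigned char *s = src + head;

        while ((size_t)(dstend - dst) > 64) {   /* 64 bytes per iteration */
                memcpy(dst, s, 64);
                dst += 64;
                s += 64;
        }
        memcpy(dstend - 64, srcend - 64, 64);   /* always copy the last 64 */
}

Recopying the final 64 bytes from the end is what lets the main loop stop
without a variable-length cleanup: whatever it leaves over (at most 64
bytes) is rewritten by the tail.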
        .p2align 4

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align dstend to 16-byte alignment.  */
L(copy_long_backwards):
        ldp     D_l, D_h, [srcend, -16]
        and     tmp1, dstend, 15
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ldp     B_l, B_h, [srcend, -32]
        ldp     C_l, C_h, [srcend, -48]
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

L(loop64_backwards):
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [srcend, -48]
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
        ldp     G_l, G_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend, -64]
        stp     G_l, G_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
        ret
SYM_FUNC_END(__pi_memcpy)
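A rough C model of the backwards path (illustrative only: the bounce
buffers stand in for the register file, and loading each 64-byte block one
step before it is stored mimics the software pipelining that keeps the
assembly correct when dst overlaps src from above):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Backward copy of more than 128 bytes, used when dst overlaps src from
 * above: capture the first 64 and the last 16 source bytes before any
 * store can clobber them, then walk down in 64-byte blocks, always loading
 * the next block before storing the previous one. */
static void copy_long_backward(unsigned char *dstin, const unsigned char *src,
                               size_t count)            /* count > 128 */
{
        unsigned char *dstend = dstin + count;
        const unsigned char *srcend = src + count;
        unsigned char last16[16], first64[64], pending[64];
        size_t skew = (uintptr_t)dstend & 15;
        ptrdiff_t remaining = (ptrdiff_t)(count - skew) - 128;

        memcpy(last16, srcend - 16, 16);        /* like the first ldp of D */
        memcpy(first64, src, 64);               /* used by the final stores */

        srcend -= skew + 64;                    /* alignment step plus the pipeline lead */
        memcpy(pending, srcend, 64);            /* pipeline lead: the A..D block */

        memcpy(dstend - 16, last16, 16);        /* unaligned 16-byte tail */
        dstend -= skew;                         /* dstend is now 16-byte aligned */

        while (remaining > 0) {                 /* 64 bytes per iteration */
                dstend -= 64;
                memcpy(dstend, pending, 64);    /* store the previous block */
                srcend -= 64;
                memcpy(pending, srcend, 64);    /* load the next block */
                remaining -= 64;
        }
        memcpy(dstend - 64, pending, 64);       /* drain the pipeline */
        memcpy(dstin, first64, 64);             /* first 64 bytes, stored last */
}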

SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy)

SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)

SYM_FUNC_ALIAS(__memmove, __pi_memmove)
EXPORT_SYMBOL(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
EXPORT_SYMBOL(memmove)
