~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/sparc/lib/NG4memcpy.S

Version: ~ [ linux-6.11-rc3 ] ~ [ linux-6.10.4 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.45 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.104 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.164 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.223 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.281 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.319 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /* SPDX-License-Identifier: GPL-2.0 */
  2 /* NG4memcpy.S: Niagara-4 optimized memcpy.
  3  *
  4  * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
  5  */
  6 
  7 #ifdef __KERNEL__
    /* Kernel build: take the linkage, VIS and ASI definitions from the
     * kernel headers and use %g7 as the extra scratch register
     * (GLOBAL_SPARE).
     */
  8 #include <linux/linkage.h>
  9 #include <asm/visasm.h>
 10 #include <asm/asi.h>
 11 #define GLOBAL_SPARE    %g7
 12 #else
    /* Non-kernel build: provide local values for the ASI and FPRS
     * constants and local VISEntryHalf/VISExitHalf macros, and use
     * %g5 as GLOBAL_SPARE instead of %g7.
     */
 13 #define ASI_BLK_INIT_QUAD_LDD_P 0xe2
 14 #define FPRS_FEF  0x04
 15 
 16 /* On T4 it is very expensive to access ASRs like %fprs and
 17  * %asi, avoiding a read or a write can save ~50 cycles.
 18  */
    /* FPU_ENTER reads %fprs into %o5 and tests the FEF bit.  The
     * branch is taken only when FEF is clear, and because it is
     * annulled the delay-slot write of FPRS_FEF to %fprs happens only
     * in that case.  Clobbers %o5 and the condition codes.
     */
 19 #define FPU_ENTER                       \
 20         rd      %fprs, %o5;             \
 21         andcc   %o5, FPRS_FEF, %g0;     \
 22         be,a,pn %icc, 999f;             \
 23          wr     %g0, FPRS_FEF, %fprs;   \
 24         999:
 25 
    /* VISExitHalf writes %fprs with only the FEF bit of the value that
     * FPU_ENTER left in %o5, so %o5 must not be modified between
     * VISEntryHalf and VISExitHalf.  The two VISExitHalf definitions
     * below are identical.
     * NOTE(review): the MEMCPY_DEBUG VISEntryHalf additionally zeroes
     * %g1, %g2, %g3, %g5 and rewrites the condition codes; presumably
     * this mimics the clobbers of the kernel macro - confirm.
     */
 26 #ifdef MEMCPY_DEBUG
 27 #define VISEntryHalf FPU_ENTER; \
 28                      clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
 29 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
 30 #else
 31 #define VISEntryHalf FPU_ENTER
 32 #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
 33 #endif
 34 
 35 #define GLOBAL_SPARE    %g5
 36 #endif
 37 
 38 #ifndef STORE_ASI
    /* STORE_ASI is the address space identifier used by STORE_INIT
     * below.  An includer may predefine it; otherwise it is
     * ASI_BLK_INIT_QUAD_LDD_P, or 0x80 when
     * SIMULATE_NIAGARA_ON_NON_NIAGARA is defined.
     */
 39 #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
 40 #define STORE_ASI       ASI_BLK_INIT_QUAD_LDD_P
 41 #else
 42 #define STORE_ASI       0x80            /* ASI_P */
 43 #endif
 44 #endif
 45 
    /* NON_USER_COPY is set when the includer supplied neither EX_LD
     * nor EX_ST.  It selects the VISEntryHalfFast/VISExitHalfFast
     * variants in the source-unaligned large-copy path.
     */
 46 #if !defined(EX_LD) && !defined(EX_ST)
 47 #define NON_USER_COPY
 48 #endif
 49 
    /* Each EX_* wrapper takes an instruction x and a second argument
     * y.  The defaults below emit x alone and discard y.
     * NOTE(review): presumably the user-copy variants that include
     * this file redefine these so that y becomes the fault fixup for
     * x - confirm against the includers.
     */
 50 #ifndef EX_LD
 51 #define EX_LD(x,y)      x
 52 #endif
 53 #ifndef EX_LD_FP
 54 #define EX_LD_FP(x,y)   x
 55 #endif
 56 
 57 #ifndef EX_ST
 58 #define EX_ST(x,y)      x
 59 #endif
 60 #ifndef EX_ST_FP
 61 #define EX_ST_FP(x,y)   x
 62 #endif
 63 
 64 
    /* LOAD(type, addr, dest) emits "type [addr], dest". */
 65 #ifndef LOAD
 66 #define LOAD(type,addr,dest)    type [addr], dest
 67 #endif
 68 
    /* STORE(type, src, addr) emits "type src, [addr]".  With
     * MEMCPY_DEBUG it instead emits the alternate-space form of the
     * instruction ("type" with an "a" suffix) addressed through %asi.
     */
 69 #ifndef STORE
 70 #ifndef MEMCPY_DEBUG
 71 #define STORE(type,src,addr)    type src, [addr]
 72 #else
 73 #define STORE(type,src,addr)    type##a src, [addr] %asi
 74 #endif
 75 #endif
 76 
    /* STORE_INIT(src, addr) is an 8-byte stxa through STORE_ASI. */
 77 #ifndef STORE_INIT
 78 #define STORE_INIT(src,addr)    stxa src, [addr] STORE_ASI
 79 #endif
 80 
    /* Symbol name of the routine; overridable by the includer. */
 81 #ifndef FUNC_NAME
 82 #define FUNC_NAME       NG4memcpy
 83 #endif
    /* PREAMBLE is expanded near the top of the routine, right after
     * the length sanity trap; it is empty by default.
     */
 84 #ifndef PREAMBLE
 85 #define PREAMBLE
 86 #endif
 87 
    /* Condition-code register name used (as %XCC) by the length
     * sanity trap at function entry.
     */
 88 #ifndef XCC
 89 #define XCC xcc
 90 #endif
 91 
 92         .register       %g2,#scratch
 93         .register       %g3,#scratch
 94 
 95         .text
 96 #ifndef EX_RETVAL
    /* Default: the return value expression is used unchanged. */
 97 #define EX_RETVAL(x)    x
 98 #endif
 99         .align          64
100 
    /* FUNC_NAME(dst, src, len): copy len bytes from src to dst.
     *
     * In:   %o0 = dst, %o1 = src, %o2 = len
     * Out:  %o0 = EX_RETVAL(original dst), which is the original dst
     *       with the default EX_RETVAL
     * Regs: %o0/%o1 advance as write/read cursors, %o2 counts the
     *       bytes still to be copied, %o3 keeps the original dst.
     *       %g1-%g3, GLOBAL_SPARE, %o4 and %o5 are scratch.
     *
     * The entry code picks a strategy from the length:
     *   len == 0    -> .Lexit
     *   len <= 3    -> .Ltiny
     *   len <= 19   -> .Lsmall
     *   len <  128  -> .Lmedium
     *   otherwise   -> falls through into .Llarge
     */
101         .globl  FUNC_NAME
102         .type   FUNC_NAME,#function
103 FUNC_NAME:      /* %o0=dst, %o1=src, %o2=len */
104 #ifdef MEMCPY_DEBUG
105         wr              %g0, 0x80, %asi
106 #endif
            /* Raise software trap 5 if any bit of len at or above
             * bit 31 is set.
             */
107         srlx            %o2, 31, %g2
108         cmp             %g2, 0
109         tne             %XCC, 5
110         PREAMBLE
            /* Remember the original dst for the return value. */
111         mov             %o0, %o3
            /* Each compare below sits in the delay slot of the previous
             * branch and feeds the next one.
             */
112         brz,pn          %o2, .Lexit
113          cmp            %o2, 3
114         ble,pn          %icc, .Ltiny
115          cmp            %o2, 19
            /* The delay slot leaves %g2 = dst | src; both .Lsmall and
             * .Lmedium test its low bits to check alignment.
             */
116         ble,pn          %icc, .Lsmall
117          or             %o0, %o1, %g2
118         cmp             %o2, 128
119         bl,pn           %icc, .Lmedium
120          nop
121 
122 .Llarge:/* len >= 0x80 */
123         /* First get dest 8 byte aligned.  */
            /* %g1 = (-dst) & 7 is the number of bytes needed to reach an
             * 8-byte boundary.  The delay slot removes them from %o2,
             * which is a no-op when %g1 is zero.
             */
124         sub             %g0, %o0, %g1
125         and             %g1, 0x7, %g1
126         brz,pt          %g1, 51f
127          sub            %o2, %g1, %o2
128 
129 
            /* Byte loop.  The store is in the branch delay slot, after
             * both cursors and %g1 have been updated, hence the
             * "- 0x01" offset and the "_plus_1" in its fixup name.
             */
130 1:      EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
131         add             %o1, 1, %o1
132         subcc           %g1, 1, %g1
133         add             %o0, 1, %o0
134         bne,pt          %icc, 1b
135          EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
136 
            /* Prefetch the source at 0x40-byte strides, from 0x40 up to
             * 0x200 bytes ahead of the read cursor.
             */
137 51:     LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
138         LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
139         LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
140         LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
141         LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
142         LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
143         LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
144         LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
145 
146         /* Check if we can use the straight fully aligned
147          * loop, or we require the alignaddr/faligndata variant.
148          */
            /* The delay slot computes %g1 = -dst on both outcomes; only
             * the aligned path below uses it.
             */
149         andcc           %o1, 0x7, %o5
150         bne,pn          %icc, .Llarge_src_unaligned
151          sub            %g0, %o0, %g1
152 
153         /* Legitimize the use of initializing stores by getting dest
154          * to be 64-byte aligned.
155          */
            /* %g1 = (-dst) & 0x3f; it is a multiple of 8 because dst is
             * already 8-byte aligned, so the loop below can move 8 bytes
             * per iteration.
             */
156         and             %g1, 0x3f, %g1
157         brz,pt          %g1, .Llarge_aligned
158          sub            %o2, %g1, %o2
159 
160 1:      EX_LD(LOAD(ldx, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
161         add             %o1, 8, %o1
162         subcc           %g1, 8, %g1
163         add             %o0, 8, %o0
164         bne,pt          %icc, 1b
165          EX_ST(STORE(stx, %g2, %o0 - 0x08), memcpy_retl_o2_plus_g1_plus_8)
166 
167 .Llarge_aligned:
168         /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
            /* %o4 = bytes handled by the 64-byte loop (len rounded down
             * to a multiple of 0x40); %o2 keeps the remainder (< 0x40).
             */
169         andn            %o2, 0x3f, %o4
170         sub             %o2, %o4, %o2
171 
            /* Main loop: 64 bytes per iteration as eight ldx loads
             * interleaved with eight STORE_INIT stores.  The source
             * cursor is advanced right after the first load, so the
             * remaining loads use negative offsets.  The subcc sets the
             * condition codes consumed by the bne at the bottom; nothing
             * in between writes them.  %g2, %g3 and GLOBAL_SPARE are
             * each reloaded once their first value has been stored.
             * The numeric suffix of each fixup name drops by 8 with
             * every store issued.
             */
172 1:      EX_LD(LOAD(ldx, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o4)
173         add             %o1, 0x40, %o1
174         EX_LD(LOAD(ldx, %o1 - 0x38, %g2), memcpy_retl_o2_plus_o4)
175         subcc           %o4, 0x40, %o4
176         EX_LD(LOAD(ldx, %o1 - 0x30, %g3), memcpy_retl_o2_plus_o4_plus_64)
177         EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE), memcpy_retl_o2_plus_o4_plus_64)
178         EX_LD(LOAD(ldx, %o1 - 0x20, %o5), memcpy_retl_o2_plus_o4_plus_64)
179         EX_ST(STORE_INIT(%g1, %o0), memcpy_retl_o2_plus_o4_plus_64)
180         add             %o0, 0x08, %o0
181         EX_ST(STORE_INIT(%g2, %o0), memcpy_retl_o2_plus_o4_plus_56)
182         add             %o0, 0x08, %o0
183         EX_LD(LOAD(ldx, %o1 - 0x18, %g2), memcpy_retl_o2_plus_o4_plus_48)
184         EX_ST(STORE_INIT(%g3, %o0), memcpy_retl_o2_plus_o4_plus_48)
185         add             %o0, 0x08, %o0
186         EX_LD(LOAD(ldx, %o1 - 0x10, %g3), memcpy_retl_o2_plus_o4_plus_40)
187         EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), memcpy_retl_o2_plus_o4_plus_40)
188         add             %o0, 0x08, %o0
189         EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE), memcpy_retl_o2_plus_o4_plus_32)
190         EX_ST(STORE_INIT(%o5, %o0), memcpy_retl_o2_plus_o4_plus_32)
191         add             %o0, 0x08, %o0
192         EX_ST(STORE_INIT(%g2, %o0), memcpy_retl_o2_plus_o4_plus_24)
193         add             %o0, 0x08, %o0
194         EX_ST(STORE_INIT(%g3, %o0), memcpy_retl_o2_plus_o4_plus_16)
195         add             %o0, 0x08, %o0
196         EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), memcpy_retl_o2_plus_o4_plus_8)
197         add             %o0, 0x08, %o0
198         bne,pt          %icc, 1b
199          LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
200 
            /* NOTE(review): presumably this barrier is needed because
             * stores through the initializing ASI are ordered more
             * weakly than normal stores - confirm against the CPU
             * documentation.
             */
201         membar          #StoreLoad | #StoreStore
202 
            /* Tail (< 0x40 bytes left): nothing left -> done; 19 bytes
             * or fewer -> byte loop; otherwise the 8-byte-aligned medium
             * path (both pointers are 8-byte aligned here).
             */
203         brz,pn          %o2, .Lexit
204          cmp            %o2, 19
205         ble,pn          %icc, .Lsmall_unaligned
206          nop
207         ba,a,pt         %icc, .Lmedium_noprefetch
208 
            /* Common exit: return the dst value saved in %o3 at entry,
             * passed through EX_RETVAL.
             */
209 .Lexit: retl
210          mov            EX_RETVAL(%o3), %o0
211 
212 .Llarge_src_unaligned:
    /* Large copy where dst is 8-byte aligned but src is not.  The data
     * is moved through the floating-point registers with
     * alignaddr/faligndata.
     * NOTE(review): VISEntryHalfFast and VISExitHalfFast are not
     * defined in the visible part of this file (presumably they come
     * from <asm/visasm.h>); the label argument is presumably the
     * bail-out target used when the FPU cannot be taken over - confirm.
     */
213 #ifdef NON_USER_COPY
214         VISEntryHalfFast(.Lmedium_vis_entry_fail)
215 #else
216         VISEntryHalf
217 #endif
            /* %o4 = bytes handled by the 64-byte loop, %o2 = remainder.
             * alignaddr leaves the 8-byte-aligned source address in %g1
             * and records the low address bits for faligndata.  %o1 is
             * advanced past the whole bulk region up front; the loop
             * reads through %g1 only.
             */
218         andn            %o2, 0x3f, %o4
219         sub             %o2, %o4, %o2
220         alignaddr       %o1, %g0, %g1
221         add             %o1, %o4, %o1
            /* Prime %f0 with the first aligned doubleword.  Every
             * iteration then loads the following eight doublewords (the
             * last one into %f0 again, ready for the next iteration),
             * merges neighbouring pairs with faligndata into %f16-%f30
             * and stores those 64 bytes.  The subcc sets the condition
             * codes consumed by the bne at the bottom of the loop.
             */
222         EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0), memcpy_retl_o2_plus_o4)
223 1:      EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2), memcpy_retl_o2_plus_o4)
224         subcc           %o4, 0x40, %o4
225         EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4), memcpy_retl_o2_plus_o4_plus_64)
226         EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6), memcpy_retl_o2_plus_o4_plus_64)
227         EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8), memcpy_retl_o2_plus_o4_plus_64)
228         EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10), memcpy_retl_o2_plus_o4_plus_64)
229         EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12), memcpy_retl_o2_plus_o4_plus_64)
230         EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14), memcpy_retl_o2_plus_o4_plus_64)
231         faligndata      %f0, %f2, %f16
232         EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0), memcpy_retl_o2_plus_o4_plus_64)
233         faligndata      %f2, %f4, %f18
234         add             %g1, 0x40, %g1
235         faligndata      %f4, %f6, %f20
236         faligndata      %f6, %f8, %f22
237         faligndata      %f8, %f10, %f24
238         faligndata      %f10, %f12, %f26
239         faligndata      %f12, %f14, %f28
240         faligndata      %f14, %f0, %f30
241         EX_ST_FP(STORE(std, %f16, %o0 + 0x00), memcpy_retl_o2_plus_o4_plus_64)
242         EX_ST_FP(STORE(std, %f18, %o0 + 0x08), memcpy_retl_o2_plus_o4_plus_56)
243         EX_ST_FP(STORE(std, %f20, %o0 + 0x10), memcpy_retl_o2_plus_o4_plus_48)
244         EX_ST_FP(STORE(std, %f22, %o0 + 0x18), memcpy_retl_o2_plus_o4_plus_40)
245         EX_ST_FP(STORE(std, %f24, %o0 + 0x20), memcpy_retl_o2_plus_o4_plus_32)
246         EX_ST_FP(STORE(std, %f26, %o0 + 0x28), memcpy_retl_o2_plus_o4_plus_24)
247         EX_ST_FP(STORE(std, %f28, %o0 + 0x30), memcpy_retl_o2_plus_o4_plus_16)
248         EX_ST_FP(STORE(std, %f30, %o0 + 0x38), memcpy_retl_o2_plus_o4_plus_8)
249         add             %o0, 0x40, %o0
250         bne,pt          %icc, 1b
251          LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
252 #ifdef NON_USER_COPY
253         VISExitHalfFast
254 #else
255         VISExitHalf
256 #endif
            /* Tail (< 0x40 bytes left): nothing left -> done; 19 bytes
             * or fewer -> byte loop; otherwise the unaligned medium path.
             */
257         brz,pn          %o2, .Lexit
258          cmp            %o2, 19
259         ble,pn          %icc, .Lsmall_unaligned
260          nop
261         ba,a,pt         %icc, .Lmedium_unaligned
262 
263 #ifdef NON_USER_COPY
    /* Target passed to VISEntryHalfFast in the source-unaligned large
     * path.  %g2 no longer holds dst | src at that point, so it is
     * recomputed before falling into .Lmedium.
     */
264 .Lmedium_vis_entry_fail:
265          or             %o0, %o1, %g2
266 #endif
    /* Medium copy.  Expects %g2 = dst | src.  If either pointer is not
     * 8-byte aligned, continue at .Lmedium_unaligned; otherwise fall
     * into the aligned copy below.
     */
267 .Lmedium:
268         LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
269         andcc           %g2, 0x7, %g0
270         bne,pn          %icc, .Lmedium_unaligned
271          nop
    /* Aligned copy, also entered directly from the large paths and from
     * .Lmedium_unaligned once both pointers are 8-byte aligned.  It
     * runs in three stages: 32-byte chunks, then 8-byte chunks, then at
     * most one 4-byte word, handing any final 1-3 bytes to .Ltiny.
     */
272 .Lmedium_noprefetch:
            /* %o5 = bytes for the 32-byte loop (len rounded down to a
             * multiple of 0x20); skip the loop when that is zero.  The
             * delay slot removes %o5 from %o2 on both outcomes.
             */
273         andncc          %o2, 0x20 - 1, %o5
274         be,pn           %icc, 2f
275          sub            %o2, %o5, %o2
            /* The four loads run before %o5 is decremented, the four
             * stores after it.  The numeric suffix of each store's fixup
             * name is the part of the current 32-byte chunk not yet
             * written: 32, 24, 16, 8.
             */
276 1:      EX_LD(LOAD(ldx, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o5)
277         EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
278         EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE), memcpy_retl_o2_plus_o5)
279         EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
280         add             %o1, 0x20, %o1
281         subcc           %o5, 0x20, %o5
282         EX_ST(STORE(stx, %g1, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
283         EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
            /* Offsets 0x00 and 0x08 are already written here, so 16
             * bytes of this chunk remain.
             */
284         EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_16)
285         EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
286         bne,pt          %icc, 1b
287          add            %o0, 0x20, %o0
            /* %o5 = bytes for the 8-byte loop (0, 8, 16 or 24). */
288 2:      andcc           %o2, 0x18, %o5
289         be,pt           %icc, 3f
290          sub            %o2, %o5, %o2
291 
            /* The store is in the branch delay slot, after the cursors
             * and %o5 were updated, hence the "- 0x08" offset and the
             * "_plus_8" in its fixup name.
             */
292 1:      EX_LD(LOAD(ldx, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o5)
293         add             %o1, 0x08, %o1
294         add             %o0, 0x08, %o0
295         subcc           %o5, 0x08, %o5
296         bne,pt          %icc, 1b
297          EX_ST(STORE(stx, %g1, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
            /* Fewer than 8 bytes remain: nothing left -> done; fewer
             * than 4 -> .Ltiny; otherwise copy one 4-byte word and hand
             * any remaining 1-3 bytes to .Ltiny.  The word store sits in
             * the delay slot of that last branch and executes whether or
             * not it is taken.
             */
298 3:      brz,pt          %o2, .Lexit
299          cmp            %o2, 0x04
300         bl,pn           %icc, .Ltiny
301          nop
302         EX_LD(LOAD(lduw, %o1 + 0x00, %g1), memcpy_retl_o2)
303         add             %o1, 0x04, %o1
304         add             %o0, 0x04, %o0
305         subcc           %o2, 0x04, %o2
306         bne,pn          %icc, .Ltiny
307          EX_ST(STORE(stw, %g1, %o0 - 0x04), memcpy_retl_o2_plus_4)
308         ba,a,pt         %icc, .Lexit
309 .Lmedium_unaligned:
310         /* First get dest 8 byte aligned.  */
            /* %g1 = (-dst) & 7 bytes are copied one at a time; the delay
             * slot removes them from %o2 (a no-op when %g1 is zero).
             */
311         sub             %g0, %o0, %g1
312         and             %g1, 0x7, %g1
313         brz,pt          %g1, 2f
314          sub            %o2, %g1, %o2
315 
316 1:      EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
317         add             %o1, 1, %o1
318         subcc           %g1, 1, %g1
319         add             %o0, 1, %o0
320         bne,pt          %icc, 1b
321          EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
322 2:
            /* dst is now 8-byte aligned.  If src is as well, use the
             * aligned copy.  Otherwise set up a shift-and-merge loop:
             *   %g1 = source misalignment in bits (left-shift count)
             *   %g2 = 64 - %g1 (right-shift count)
             *   %o1 = source address rounded down to 8 bytes
             *   %o4 = first aligned doubleword, already shifted left
             *   %o5 = bytes to copy in the loop (multiple of 8)
             *   %o2 = remaining tail (< 8 bytes)
             */
323         and             %o1, 0x7, %g1
324         brz,pn          %g1, .Lmedium_noprefetch
325          sll            %g1, 3, %g1
326         mov             64, %g2
327         sub             %g2, %g1, %g2
328         andn            %o1, 0x7, %o1
329         EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
330         sllx            %o4, %g1, %o4
331         andn            %o2, 0x08 - 1, %o5
332         sub             %o2, %o5, %o2
            /* Each iteration loads the next aligned doubleword, ORs its
             * right-shifted value with the bits carried in %o4 and
             * stores the result.  The delay slot carries the same
             * doubleword, shifted left, into the next iteration.
             */
333 1:      EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
334         add             %o1, 0x08, %o1
335         subcc           %o5, 0x08, %o5
336         srlx            %g3, %g2, GLOBAL_SPARE
337         or              GLOBAL_SPARE, %o4, GLOBAL_SPARE
338         EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
339         add             %o0, 0x08, %o0
340         bne,pt          %icc, 1b
341          sllx           %g3, %g1, %o4
            /* Convert the shift count back to a byte offset and add it
             * to %o1, so the read cursor is a true byte address again.
             */
342         srl             %g1, 3, %g1
343         add             %o1, %g1, %o1
344         brz,pn          %o2, .Lexit
345          nop
            /* NOTE(review): this branch has no annul bit and no
             * delay-slot instruction of its own, so the instruction that
             * follows it in the file (the first byte load of .Ltiny)
             * executes in its delay slot.  .Lsmall_unaligned starts with
             * the same load into %g1, so this looks harmless - confirm
             * that it is intentional.
             */
346         ba,pt           %icc, .Lsmall_unaligned
347 
348 .Ltiny:
            /* Copy the final 1 to 3 bytes without a loop and without
             * moving the cursors.  After each of the first two bytes
             * %o2 is decremented and the code leaves for .Lexit once it
             * reaches zero.  Each store sits in the delay slot of the
             * exit branch and executes whether or not it is taken.
             */
349         EX_LD(LOAD(ldub, %o1 + 0x00, %g1), memcpy_retl_o2)
350         subcc           %o2, 1, %o2
351         be,pn           %icc, .Lexit
352          EX_ST(STORE(stb, %g1, %o0 + 0x00), memcpy_retl_o2_plus_1)
353         EX_LD(LOAD(ldub, %o1 + 0x01, %g1), memcpy_retl_o2)
354         subcc           %o2, 1, %o2
355         be,pn           %icc, .Lexit
356          EX_ST(STORE(stb, %g1, %o0 + 0x01), memcpy_retl_o2_plus_1)
            /* Third byte: %o2 is not decremented again, so both fixup
             * names here are the plain "_o2" form.
             */
357         EX_LD(LOAD(ldub, %o1 + 0x02, %g1), memcpy_retl_o2)
358         ba,pt           %icc, .Lexit
359          EX_ST(STORE(stb, %g1, %o0 + 0x02), memcpy_retl_o2)
360 
361 .Lsmall:
            /* Small copy, reached from the entry dispatch with
             * 4 <= len <= 19 and %g2 = dst | src.  If either pointer is
             * not 4-byte aligned, use the byte loop.  The delay slot
             * computes %o5 = len rounded down to a multiple of 4 on both
             * outcomes; only the word loop below uses it.
             */
362         andcc           %g2, 0x3, %g0
363         bne,pn          %icc, .Lsmall_unaligned
364          andn           %o2, 0x4 - 1, %o5
            /* %o2 keeps the 0-3 bytes left over after the word loop. */
365         sub             %o2, %o5, %o2
366 1:
            /* Word loop.  The store is in the branch delay slot, after
             * the cursors and %o5 were updated, hence the "- 0x04"
             * offset and the "_plus_4" in its fixup name.
             */
367         EX_LD(LOAD(lduw, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o5)
368         add             %o1, 0x04, %o1
369         subcc           %o5, 0x04, %o5
370         add             %o0, 0x04, %o0
371         bne,pt          %icc, 1b
372          EX_ST(STORE(stw, %g1, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
            /* Nothing left -> done; otherwise 1-3 bytes go to .Ltiny. */
373         brz,pt          %o2, .Lexit
374          nop
375         ba,a,pt         %icc, .Ltiny
376 
377 .Lsmall_unaligned:
            /* Byte-at-a-time copy of the remaining %o2 bytes.  %o2 must
             * be non-zero on entry because the loop tests it only after
             * the decrement.  The store is in the branch delay slot,
             * after the cursors and %o2 were updated, hence the "- 0x01"
             * offset and the "_plus_1" in its fixup name.
             */
378 1:      EX_LD(LOAD(ldub, %o1 + 0x00, %g1), memcpy_retl_o2)
379         add             %o1, 1, %o1
380         add             %o0, 1, %o0
381         subcc           %o2, 1, %o2
382         bne,pt          %icc, 1b
383          EX_ST(STORE(stb, %g1, %o0 - 0x01), memcpy_retl_o2_plus_1)
            /* The annul bit on this unconditional branch cancels its
             * delay slot, so the nop after it is never executed.
             */
384         ba,a,pt         %icc, .Lexit
385          nop
386         .size           FUNC_NAME, .-FUNC_NAME

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php