
TOMOYO Linux Cross Reference
Linux/arch/powerpc/lib/memcpy_power7.S


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE   0
#endif

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
#endif
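
/*
 * lvsl/lvsr build a permute control vector from the low four bits of
 * the source address.  On little-endian the vector byte lanes are
 * mirrored, so the shift direction and the vperm source operand order
 * are both swapped to produce the same net realignment as on
 * big-endian.
 */

/*
 * memcpy_power7(r3 = dest, r4 = src, r5 = len), returns dest.
 * Copies shorter than 16B take the scalar short-copy path; copies
 * larger than 4kB may take the VMX path on CPUs with ALTIVEC
 * (SELFTEST_CASE appears to let the selftest build force one path);
 * everything else uses the unrolled integer loop.
 */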
_GLOBAL(memcpy_power7)
        cmpldi  r5,16
        cmpldi  cr1,r5,4096
        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        blt     .Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
        bgt     cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
        /* Get the source 8B aligned */
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)

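        /*
         * r6 = -src, so its low three bits are the byte count to the
         * next 8B boundary (0 if already aligned).  mtocrf 0x01 copied
         * those bits into cr7, so the bf tests below peel off 1, 2 and
         * 4 byte copies; clrldi keeps the count in r6 so it can be
         * subtracted from the length at 3: below.
         */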
        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

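        /*
         * At least 128B left: spill the nonvolatile GPRs used by the
         * unrolled cacheline loop and stash LR in the caller's frame
         * (offset 16 above the old r1).
         */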
        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)

        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
        .align  5
4:
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        ld      r9,32(r4)
        ld      r10,40(r4)
        ld      r11,48(r4)
        ld      r12,56(r4)
        ld      r14,64(r4)
        ld      r15,72(r4)
        ld      r16,80(r4)
        ld      r17,88(r4)
        ld      r18,96(r4)
        ld      r19,104(r4)
        ld      r20,112(r4)
        ld      r21,120(r4)
        addi    r4,r4,128
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        std     r9,32(r3)
        std     r10,40(r3)
        std     r11,48(r3)
        std     r12,56(r3)
        std     r14,64(r3)
        std     r15,72(r3)
        std     r16,80(r3)
        std     r17,88(r3)
        std     r18,96(r3)
        std     r19,104(r3)
        std     r20,112(r3)
        std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

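        /* Keep only the residual length mod 128 */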
        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
5:      srdi    r6,r5,4
        mtocrf  0x01,r6
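        /*
         * cr7 now holds len >> 4; its 4, 2 and 1 bits select the 64B,
         * 32B and 16B copies below.
         */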

6:      bf      cr7*4+1,7f
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        ld      r9,32(r4)
        ld      r10,40(r4)
        ld      r11,48(r4)
        ld      r12,56(r4)
        addi    r4,r4,64
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        std     r9,32(r3)
        std     r10,40(r3)
        std     r11,48(r3)
        std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        addi    r4,r4,32
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
        ld      r0,0(r4)
        ld      r6,8(r4)
        addi    r4,r4,16
        std     r0,0(r3)
        std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
.Lshort_copy:
        mtocrf  0x01,r5
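        /* cr7 = final length (0-15); bits select 8, 4, 2 and 1 byte copies */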
        bf      cr7*4+0,12f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r6,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        blr

.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
        mflr    r0
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      CFUNC(enter_vmx_ops)
        cmpwi   cr1,r3,0
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STK_REG(R31)(r1)
        ld      r4,STK_REG(R30)(r1)
        ld      r5,STK_REG(R29)(r1)
        mtlr    r0
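
        /*
         * enter_vmx_ops returns 0 in r3 when the vector unit cannot be
         * used; cr1 records that here and is tested only after the
         * prefetch streams have been programmed, so the scalar fallback
         * still benefits from the touches.
         */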

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  r7,0x3FF
        ble     1f
        li      r7,0x3FF
1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

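        /*
         * r6/r7 describe the load stream and r9/r10 the store stream:
         * cacheline-aligned start address in r6/r9, unit count and
         * depth packed into r7/r10, stream ID in the low bits.
         * DCBT_SETUP_STREAMS expands to the dcbt/dcbtst enhanced-touch
         * forms that program both streams, using r8 as a scratch
         * register.
         */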
        DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)

        beq     cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy

        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
        ld      r0,0(r4)
        addi    r4,r4,8
        std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)
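        /*
         * cr7 now holds (bytes to the 128B boundary) >> 4 and selects
         * the 16B, 32B and 64B vector steps below; r6 keeps the full
         * byte count for the length adjustment at 7: below.
         */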

        li      r9,16
        li      r10,32
        li      r11,48

        bf      cr7*4+3,5f
        lvx     v1,0,r4
        addi    r4,r4,16
        stvx    v1,0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
        lvx     v1,0,r4
        lvx     v0,r4,r9
        addi    r4,r4,32
        stvx    v1,0,r3
        stvx    v0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
        lvx     v3,0,r4
        lvx     v2,r4,r9
        lvx     v1,r4,r10
        lvx     v0,r4,r11
        addi    r4,r4,64
        stvx    v3,0,r3
        stvx    v2,r3,r9
        stvx    v1,r3,r10
        stvx    v0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
        lvx     v7,0,r4
        lvx     v6,r4,r9
        lvx     v5,r4,r10
        lvx     v4,r4,r11
        lvx     v3,r4,r12
        lvx     v2,r4,r14
        lvx     v1,r4,r15
        lvx     v0,r4,r16
        addi    r4,r4,128
        stvx    v7,0,r3
        stvx    v6,r3,r9
        stvx    v5,r3,r10
        stvx    v4,r3,r11
        stvx    v3,r3,r12
        stvx    v2,r3,r14
        stvx    v1,r3,r15
        stvx    v0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
        lvx     v3,0,r4
        lvx     v2,r4,r9
        lvx     v1,r4,r10
        lvx     v0,r4,r11
        addi    r4,r4,64
        stvx    v3,0,r3
        stvx    v2,r3,r9
        stvx    v1,r3,r10
        stvx    v0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
        lvx     v1,0,r4
        lvx     v0,r4,r9
        addi    r4,r4,32
        stvx    v1,0,r3
        stvx    v0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
        lvx     v1,0,r4
        addi    r4,r4,16
        stvx    v1,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        ld      r0,0(r4)
        addi    r4,r4,8
        std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

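        /*
         * Done: pop the frame, reload the original destination as the
         * return value and tail-call exit_vmx_ops, which hands r3 back
         * to the caller.
         */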
15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        b       CFUNC(exit_vmx_ops)             /* tail call optimise */

.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r7,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

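        /*
         * Software-pipelined realignment: lvx ignores the low four
         * bits of the address, so every load below is 16B aligned.
         * v0 always carries the most recent load, and each VPERM
         * merges it with the next one to produce a single aligned 16B
         * result for stvx.  The source pointer therefore runs 16B
         * ahead of the data consumed, unwound at 11: below.
         */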
        LVS(v16,0,r4)           /* Setup permute control vector */
        lvx     v0,0,r4
        addi    r4,r4,16

        bf      cr7*4+3,5f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
        stvx    v8,0,r3
        addi    r3,r3,16
        vor     v0,v1,v1

5:      bf      cr7*4+2,6f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
        stvx    v8,0,r3
        stvx    v9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
        lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
        lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
        lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
        lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
        stvx    v8,0,r3
        stvx    v9,r3,r9
        stvx    v10,r3,r10
        stvx    v11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
        lvx     v7,0,r4
        VPERM(v8,v0,v7,v16)
        lvx     v6,r4,r9
        VPERM(v9,v7,v6,v16)
        lvx     v5,r4,r10
        VPERM(v10,v6,v5,v16)
        lvx     v4,r4,r11
        VPERM(v11,v5,v4,v16)
        lvx     v3,r4,r12
        VPERM(v12,v4,v3,v16)
        lvx     v2,r4,r14
        VPERM(v13,v3,v2,v16)
        lvx     v1,r4,r15
        VPERM(v14,v2,v1,v16)
        lvx     v0,r4,r16
        VPERM(v15,v1,v0,v16)
        addi    r4,r4,128
        stvx    v8,0,r3
        stvx    v9,r3,r9
        stvx    v10,r3,r10
        stvx    v11,r3,r11
        stvx    v12,r3,r12
        stvx    v13,r3,r14
        stvx    v14,r3,r15
        stvx    v15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
        lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
        lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
        lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
        lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
        stvx    v8,0,r3
        stvx    v9,r3,r9
        stvx    v10,r3,r10
        stvx    v11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
        stvx    v8,0,r3
        stvx    v9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
        stvx    v8,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r6,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        b       CFUNC(exit_vmx_ops)             /* tail call optimise */
#endif /* CONFIG_ALTIVEC */
