~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/tools/testing/selftests/powerpc/stringloops/memcmp_64.S

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /* SPDX-License-Identifier: GPL-2.0-or-later */
  2 /*
  3  * Author: Anton Blanchard <anton@au.ibm.com>
  4  * Copyright 2015 IBM Corporation.
  5  */
  6 #include <linux/export.h>
  7 #include <asm/ppc_asm.h>
  8 #include <asm/ppc-opcode.h>
  9 
 10 #define off8    r6      /* index register; set to 8 with li before use */
 11 #define off16   r7      /* index register; set to 16 with li before use */
 12 #define off24   r8      /* index register; set to 24 with li before use */
 13 /* rA..rH mostly hold loaded data; rD..rH are r27-r31, which .Llong_novmx_cmp saves on the stack */
 14 #define rA      r9
 15 #define rB      r10
 16 #define rC      r11
 17 #define rD      r27
 18 #define rE      r28
 19 #define rF      r29
 20 #define rG      r30
 21 #define rH      r31
 22 /* Loads are byte-reversed on little endian; LVS and the VPERM source order also depend on endianness */
 23 #ifdef __LITTLE_ENDIAN__
 24 #define LH      lhbrx
 25 #define LW      lwbrx
 26 #define LD      ldbrx
 27 #define LVS     lvsr
 28 #define VPERM(_VRT,_VRA,_VRB,_VRC) \
 29         vperm _VRT,_VRB,_VRA,_VRC
 30 #else /* big endian */
 31 #define LH      lhzx
 32 #define LW      lwzx
 33 #define LD      ldx
 34 #define LVS     lvsl
 35 #define VPERM(_VRT,_VRA,_VRB,_VRC) \
 36         vperm _VRT,_VRA,_VRB,_VRC
 37 #endif /* __LITTLE_ENDIAN__ */
 38 
 39 #define VMX_THRESH 4096 /* lengths of at least this many bytes try the VMX paths */
 40 #define ENTER_VMX_OPS   /* call enter_vmx_ops() keeping r3-r5 and LR; clobbers r0; cr1 = (result == 0) */ \
 41         mflr    r0;     \
 42         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
 43         std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
 44         std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
 45         std     r0,16(r1); \
 46         stdu    r1,-STACKFRAMESIZE(r1); \
 47         bl      CFUNC(enter_vmx_ops); \
 48         cmpwi   cr1,r3,0; \
 49         ld      r0,STACKFRAMESIZE+16(r1); \
 50         ld      r3,STK_REG(R31)(r1); \
 51         ld      r4,STK_REG(R30)(r1); \
 52         ld      r5,STK_REG(R29)(r1); \
 53         addi    r1,r1,STACKFRAMESIZE; \
 54         mtlr    r0
 55 
 56 #define EXIT_VMX_OPS /* call exit_vmx_ops() keeping r3-r5 and LR; clobbers r0 */ \
 57         mflr    r0; \
 58         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
 59         std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
 60         std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
 61         std     r0,16(r1); \
 62         stdu    r1,-STACKFRAMESIZE(r1); \
 63         bl      CFUNC(exit_vmx_ops); \
 64         ld      r0,STACKFRAMESIZE+16(r1); \
 65         ld      r3,STK_REG(R31)(r1); \
 66         ld      r4,STK_REG(R30)(r1); \
 67         ld      r5,STK_REG(R29)(r1); \
 68         addi    r1,r1,STACKFRAMESIZE; \
 69         mtlr    r0
 70 
 71 /*
 72  * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 73  * to a 16 byte boundary, and permutes the result with the 1st 16 bytes.
 74  *
 75  *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 76  *    ^                                  ^                                 ^
 77  * 0xbbbb10                          0xbbbb20                          0xbbbb30
 78  *                                 ^
 79  *                                _vaddr
 80  *
 81  * The 2nd QW is fetched with lvx at _vaddr + off16; users set off16 to 16.
 82  * _vmask is the mask generated by LVS
 83  * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
 84  *   for example: 0xyyyyyyyyyyyyy012 for big endian
 85  * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
 86  *   for example: 0x3456789abcdefzzz for big endian
 87  * The permute result is saved in _v_res.
 88  *   for example: 0x0123456789abcdef for big endian.
 89  */
 90 #define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
 91         lvx     _v2nd_qw,_vaddr,off16; \
 92         VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
 93 
 94 /*
 95  * memcmp: compare r5 bytes at r3 (s1) with those at r4 (s2). r3 returns 0
 96  * when equal, else the byte difference (.Lshort) or 1/-1 (doubleword paths).
 97  * There are 2 categories of handlers:
 98  * 1) s1/s2 have the same offset within 8 bytes: .Lsameoffset_xxxx
 99  * 2) s1/s2 have different offsets within 8 bytes: .Ldiffoffset_xxxx
100  */
101 _GLOBAL_TOC(memcmp)
102         cmpdi   cr1,r5,0        /* cr1: is the length zero? */
103 
104         /* cr0 records whether the src/dst addresses share the same
105          * offset within 8 bytes; the bne at .Lno_short consumes it.
106          */
107         xor     r6,r3,r4
108         andi.   r6,r6,7
109 
110         /* Use the short (byte) loop when there are
111          * less than 8 bytes to compare.
112          */
113         cmpdi   cr6,r5,7
114 
115         beq     cr1,.Lzero
116         bgt     cr6,.Lno_short
117 
118 .Lshort:        /* byte loop, unrolled 4 times; CTR counts the bytes left */
119         mtctr   r5
120 1:      lbz     rA,0(r3)
121         lbz     rB,0(r4)
122         subf.   rC,rB,rA        /* rC = rA - rB, returned by .Lnon_zero */
123         bne     .Lnon_zero
124         bdz     .Lzero          /* that was the last byte: equal */
125 
126         lbz     rA,1(r3)
127         lbz     rB,1(r4)
128         subf.   rC,rB,rA
129         bne     .Lnon_zero
130         bdz     .Lzero
131 
132         lbz     rA,2(r3)
133         lbz     rB,2(r4)
134         subf.   rC,rB,rA
135         bne     .Lnon_zero
136         bdz     .Lzero
137 
138         lbz     rA,3(r3)
139         lbz     rB,3(r4)
140         subf.   rC,rB,rA
141         bne     .Lnon_zero
142 
143         addi    r3,r3,4
144         addi    r4,r4,4
145 
146         bdnz    1b              /* falls into .Lzero when CTR reaches 0 */
147 
148 .Lzero:         /* the buffers compared equal: return 0 */
149         li      r3,0
150         blr
151 
152 .Lno_short:     /* more than 7 bytes to compare */
153         dcbt    0,r3
154         dcbt    0,r4
155         bne     .Ldiffoffset_8bytes_make_align_start    /* cr0 is from the andi. at entry */
156 
157 
158 .Lsameoffset_8bytes_make_align_start:
159         /* attempt to compare the bytes that are not 8 byte aligned so that
160          * the rest of the comparison can run on 8 byte aligned addresses.
161          */
162         andi.   r6,r3,7         /* cr0.eq when r3 is already 8 byte aligned */
163 
164         /* Try to compare the first double word which is not 8 bytes aligned:
165          * load the first double word at (src & ~7UL) and shift left by the
166          * appropriate number of bits before the comparison.
167          */
168         rlwinm  r6,r3,3,26,28   /* r6 = (r3 & 7) * 8; cr0 is left untouched */
169         beq     .Lsameoffset_8bytes_aligned
170         clrrdi  r3,r3,3
171         clrrdi  r4,r4,3
172         LD      rA,0,r3
173         LD      rB,0,r4
174         sld     rA,rA,r6
175         sld     rB,rB,r6
176         cmpld   cr0,rA,rB
177         srwi    r6,r6,3         /* r6 = r3's original offset within 8 bytes */
178         bne     cr0,.LcmpAB_lightweight
179         subfic  r6,r6,8         /* r6 = number of bytes just compared */
180         subf.   r5,r6,r5        /* r5 = bytes left */
181         addi    r3,r3,8
182         addi    r4,r4,8
183         beq     .Lzero
184 
185 .Lsameoffset_8bytes_aligned:
186         /* now we are aligned with 8 bytes.
187          * Use .Llong loop if left cmp bytes are equal or greater than 32B.
188          */
189         cmpdi   cr6,r5,31
190         bgt     cr6,.Llong
191 
192 .Lcmp_lt32bytes:
193         /* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
194         cmpdi   cr5,r5,7
195         srdi    r0,r5,3         /* r0 = number of whole doublewords */
196         ble     cr5,.Lcmp_rest_lt8bytes
197 
198         /* handle 8 ~ 31 bytes */
199         clrldi  r5,r5,61        /* r5 = r5 & 7: bytes left after the doublewords */
200         mtctr   r0
201 2:
202         LD      rA,0,r3
203         LD      rB,0,r4
204         cmpld   cr0,rA,rB
205         addi    r3,r3,8
206         addi    r4,r4,8
207         bne     cr0,.LcmpAB_lightweight
208         bdnz    2b
209 
210         cmpwi   r5,0
211         beq     .Lzero
212 
213 .Lcmp_rest_lt8bytes:
214         /*
215          * Here we have less than 8 bytes to compare. At least s1 is aligned to
216          * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
217          * page boundary, otherwise we might read past the end of the buffer and
218          * trigger a page fault. We use 4K as the conservative minimum page
219          * size. If we detect that case we go to the byte-by-byte loop.
220          *
221          * Otherwise the next double word is loaded from s1 and s2, and shifted
222          * right to compare the appropriate bits.
223          */
224         clrldi  r6,r4,(64-12)   // r6 = r4 & 0xfff
225         cmpdi   r6,0xff8
226         bgt     .Lshort
227 
228         subfic  r6,r5,8
229         slwi    r6,r6,3         /* r6 = (8 - r5) * 8: bits beyond the length */
230         LD      rA,0,r3
231         LD      rB,0,r4
232         srd     rA,rA,r6
233         srd     rB,rB,r6
234         cmpld   cr0,rA,rB
235         bne     cr0,.LcmpAB_lightweight
236         b       .Lzero
237 
238 .Lnon_zero:     /* return the byte difference computed in .Lshort */
239         mr      r3,rC
240         blr
241 
242 .Llong:
243 #ifdef CONFIG_ALTIVEC
244 BEGIN_FTR_SECTION
245         /* Try to use vmx loop if length is equal or greater than 4K */
246         cmpldi  cr6,r5,VMX_THRESH
247         bge     cr6,.Lsameoffset_vmx_cmp
248 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
249 
250 .Llong_novmx_cmp:
251 #endif
252         /* At least s1 addr is aligned with 8 bytes */
253         li      off8,8
254         li      off16,16
255         li      off24,24
256         /* rD..rH are r27-r31: save them below the stack pointer first */
257         std     r31,-8(r1)
258         std     r30,-16(r1)
259         std     r29,-24(r1)
260         std     r28,-32(r1)
261         std     r27,-40(r1)
262 
263         srdi    r0,r5,5         /* r0 = number of 32 byte blocks */
264         mtctr   r0
265         andi.   r5,r5,31        /* r5 = bytes left after the blocks */
266         /* Results: cr0 = A vs B, cr1 = C vs D, cr6 = E vs F, cr7 = G vs H */
267         LD      rA,0,r3
268         LD      rB,0,r4
269 
270         LD      rC,off8,r3
271         LD      rD,off8,r4
272 
273         LD      rE,off16,r3
274         LD      rF,off16,r4
275 
276         LD      rG,off24,r3
277         LD      rH,off24,r4
278         cmpld   cr0,rA,rB
279 
280         addi    r3,r3,32
281         addi    r4,r4,32
282 
283         bdz     .Lfirst32       /* CTR was 1: only this block to compare */
284         /* load the next block while comparing the one loaded before it */
285         LD      rA,0,r3
286         LD      rB,0,r4
287         cmpld   cr1,rC,rD
288 
289         LD      rC,off8,r3
290         LD      rD,off8,r4
291         cmpld   cr6,rE,rF
292 
293         LD      rE,off16,r3
294         LD      rF,off16,r4
295         cmpld   cr7,rG,rH
296         bne     cr0,.LcmpAB
297 
298         LD      rG,off24,r3
299         LD      rH,off24,r4
300         cmpld   cr0,rA,rB
301         bne     cr1,.LcmpCD
302 
303         addi    r3,r3,32
304         addi    r4,r4,32
305 
306         bdz     .Lsecond32
307 
308         .balign 16
309         /* main loop: one 32 byte block per iteration */
310 1:      LD      rA,0,r3
311         LD      rB,0,r4
312         cmpld   cr1,rC,rD
313         bne     cr6,.LcmpEF
314 
315         LD      rC,off8,r3
316         LD      rD,off8,r4
317         cmpld   cr6,rE,rF
318         bne     cr7,.LcmpGH
319 
320         LD      rE,off16,r3
321         LD      rF,off16,r4
322         cmpld   cr7,rG,rH
323         bne     cr0,.LcmpAB
324 
325         LD      rG,off24,r3
326         LD      rH,off24,r4
327         cmpld   cr0,rA,rB
328         bne     cr1,.LcmpCD
329 
330         addi    r3,r3,32
331         addi    r4,r4,32
332 
333         bdnz    1b
334 
335 .Lsecond32:     /* no more loads: finish the compares that are still pending */
336         cmpld   cr1,rC,rD
337         bne     cr6,.LcmpEF
338 
339         cmpld   cr6,rE,rF
340         bne     cr7,.LcmpGH
341 
342         cmpld   cr7,rG,rH
343         bne     cr0,.LcmpAB
344 
345         bne     cr1,.LcmpCD
346         bne     cr6,.LcmpEF
347         bne     cr7,.LcmpGH
348 
349 .Ltail:         /* all blocks equal: restore r27-r31, then handle the bytes left */
350         ld      r31,-8(r1)
351         ld      r30,-16(r1)
352         ld      r29,-24(r1)
353         ld      r28,-32(r1)
354         ld      r27,-40(r1)
355 
356         cmpdi   r5,0
357         beq     .Lzero
358         b       .Lshort
359 
360 .Lfirst32:      /* single block: cr0 is already set, do the other three compares */
361         cmpld   cr1,rC,rD
362         cmpld   cr6,rE,rF
363         cmpld   cr7,rG,rH
364 
365         bne     cr0,.LcmpAB
366         bne     cr1,.LcmpCD
367         bne     cr6,.LcmpEF
368         bne     cr7,.LcmpGH
369 
370         b       .Ltail
371 
372 .LcmpAB:        /* return 1 if cr0 says greater, else -1 */
373         li      r3,1
374         bgt     cr0,.Lout
375         li      r3,-1
376         b       .Lout
377 
378 .LcmpCD:        /* return 1 if cr1 says greater, else -1 */
379         li      r3,1
380         bgt     cr1,.Lout
381         li      r3,-1
382         b       .Lout
383 
384 .LcmpEF:        /* return 1 if cr6 says greater, else -1 */
385         li      r3,1
386         bgt     cr6,.Lout
387         li      r3,-1
388         b       .Lout
389 
390 .LcmpGH:        /* return 1 if cr7 says greater, else -1; falls into .Lout */
391         li      r3,1
392         bgt     cr7,.Lout
393         li      r3,-1
394 
395 .Lout:          /* restore r27-r31 and return r3 */
396         ld      r31,-8(r1)
397         ld      r30,-16(r1)
398         ld      r29,-24(r1)
399         ld      r28,-32(r1)
400         ld      r27,-40(r1)
401         blr
402 
403 .LcmpAB_lightweight:   /* skip NV GPRS restore */
404         li      r3,1
405         bgtlr                   /* return 1 if cr0 says greater */
406         li      r3,-1
407         blr
408 
409 #ifdef CONFIG_ALTIVEC
410 .Lsameoffset_vmx_cmp:
411         /* Entered when the src/dst addrs have the same offset within an
412          * 8 byte boundary.
413          *
414          * There is an optimization based on the following fact: memcmp()
415          * tends to fail early, within the first 32 bytes.
416          * Before applying VMX instructions which will lead to 32x128bits
417          * VMX regs load/restore penalty, we compare the first 32 bytes
418          * so that we can catch the ~80% fail cases.
419          */
420 
421         li      r0,4            /* 4 doublewords = 32 bytes */
422         mtctr   r0
423 .Lsameoffset_prechk_32B_loop:
424         LD      rA,0,r3
425         LD      rB,0,r4
426         cmpld   cr0,rA,rB
427         addi    r3,r3,8
428         addi    r4,r4,8
429         bne     cr0,.LcmpAB_lightweight
430         addi    r5,r5,-8
431         bdnz    .Lsameoffset_prechk_32B_loop
432 
433         ENTER_VMX_OPS
434         beq     cr1,.Llong_novmx_cmp    /* enter_vmx_ops() returned 0 */
435 
436 3:
437         /* need to check whether r4 has the same offset with r3
438          * for 16 bytes boundary.
439          */
440         xor     r0,r3,r4
441         andi.   r0,r0,0xf
442         bne     .Ldiffoffset_vmx_cmp_start
443 
444         /* len is no less than 4KB. Need to align with 16 bytes further.
445          */
446         andi.   rA,r3,8         /* cr0.eq when bit 3 of r3 is clear */
447         LD      rA,0,r3
448         beq     4f
449         LD      rB,0,r4
450         cmpld   cr0,rA,rB
451         addi    r3,r3,8
452         addi    r4,r4,8
453         addi    r5,r5,-8
454 
455         beq     cr0,4f
456         /* save and restore cr0 */
457         mfocrf  r5,128          /* r5 is preserved across EXIT_VMX_OPS */
458         EXIT_VMX_OPS
459         mtocrf  128,r5
460         b       .LcmpAB_lightweight
461 
462 4:
463         /* compare 32 bytes for each loop */
464         srdi    r0,r5,5
465         mtctr   r0
466         clrldi  r5,r5,59        /* r5 = r5 & 31: bytes left after the loop */
467         li      off16,16
468 
469 .balign 16
470 5:
471         lvx     v0,0,r3
472         lvx     v1,0,r4
473         VCMPEQUD_RC(v0,v0,v1)
474         bnl     cr6,7f          /* not all equal: diff is in these 16 bytes */
475         lvx     v0,off16,r3
476         lvx     v1,off16,r4
477         VCMPEQUD_RC(v0,v0,v1)
478         bnl     cr6,6f          /* not all equal: diff is in the next 16 bytes */
479         addi    r3,r3,32
480         addi    r4,r4,32
481         bdnz    5b
482 
483         EXIT_VMX_OPS
484         cmpdi   r5,0
485         beq     .Lzero
486         b       .Lcmp_lt32bytes
487 
488 6:              /* step over the first 16 bytes, which compared equal */
489         addi    r3,r3,16
490         addi    r4,r4,16
491 
492 7:
493         /* diff the last 16 bytes with two scalar doubleword compares */
494         EXIT_VMX_OPS
495         LD      rA,0,r3
496         LD      rB,0,r4
497         cmpld   cr0,rA,rB
498         li      off8,8
499         bne     cr0,.LcmpAB_lightweight
500 
501         LD      rA,off8,r3
502         LD      rB,off8,r4
503         cmpld   cr0,rA,rB
504         bne     cr0,.LcmpAB_lightweight
505         b       .Lzero
506 #endif
507 
508 .Ldiffoffset_8bytes_make_align_start:
509         /* now try to align s1 with 8 bytes */
510         rlwinm  r6,r3,3,26,28   /* r6 = (r3 & 7) * 8; cr0 is left untouched */
511         beq     .Ldiffoffset_align_s1_8bytes
512         /* NOTE(review): the beq above tests the cr0 left by the bne at .Lno_short, so it looks never taken - confirm */
513         clrrdi  r3,r3,3
514         LD      rA,0,r3
515         LD      rB,0,r4  /* unaligned load */
516         sld     rA,rA,r6
517         srd     rA,rA,r6        /* with the sld: clear the bytes that precede s1 */
518         srd     rB,rB,r6        /* line up the first bytes of s2 with rA */
519         cmpld   cr0,rA,rB
520         srwi    r6,r6,3         /* r6 = r3's original offset within 8 bytes */
521         bne     cr0,.LcmpAB_lightweight
522 
523         subfic  r6,r6,8         /* r6 = number of bytes just compared */
524         subf.   r5,r6,r5        /* r5 = bytes left */
525         addi    r3,r3,8
526         add     r4,r4,r6        /* s2 advances by the bytes compared only */
527 
528         beq     .Lzero
529 
530 .Ldiffoffset_align_s1_8bytes:
531         /* now s1 is aligned with 8 bytes. */
532 #ifdef CONFIG_ALTIVEC
533 BEGIN_FTR_SECTION
534         /* only do vmx ops when the size equal or greater than 4K bytes */
535         cmpdi   cr5,r5,VMX_THRESH       /* NOTE(review): signed here, cmpldi at .Llong - confirm */
536         bge     cr5,.Ldiffoffset_vmx_cmp
537 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
538 
539 .Ldiffoffset_novmx_cmp:
540 #endif
541 
542         /* up to 31 bytes go to .Lcmp_lt32bytes, longer ones to the 32 byte loop */
543         cmpdi   cr5,r5,31
544         ble     cr5,.Lcmp_lt32bytes
545 
546 #ifdef CONFIG_ALTIVEC
547         b       .Llong_novmx_cmp
548 #else
549         b       .Llong
550 #endif
551 
552 #ifdef CONFIG_ALTIVEC
553 .Ldiffoffset_vmx_cmp:
554         /* perform a 32 bytes pre-checking before
555          * enable VMX operations.
556          */
557         li      r0,4            /* 4 doublewords = 32 bytes */
558         mtctr   r0
559 .Ldiffoffset_prechk_32B_loop:
560         LD      rA,0,r3
561         LD      rB,0,r4
562         cmpld   cr0,rA,rB
563         addi    r3,r3,8
564         addi    r4,r4,8
565         bne     cr0,.LcmpAB_lightweight
566         addi    r5,r5,-8
567         bdnz    .Ldiffoffset_prechk_32B_loop
568 
569         ENTER_VMX_OPS
570         beq     cr1,.Ldiffoffset_novmx_cmp      /* enter_vmx_ops() returned 0 */
571 
572 .Ldiffoffset_vmx_cmp_start:     /* also branched to from .Lsameoffset_vmx_cmp */
573         /* Firstly try to align r3 with 16 bytes */
574         andi.   r6,r3,0xf       /* r6 = r3 & 15 */
575         li      off16,16
576         beq     .Ldiffoffset_vmx_s1_16bytes_align
577 
578         LVS     v3,0,r3         /* permute mask for r3 */
579         LVS     v4,0,r4         /* permute mask for r4 */
580 
581         lvx     v5,0,r3
582         lvx     v6,0,r4
583         LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
584         LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
585 
586         VCMPEQUB_RC(v7,v9,v10)
587         bnl     cr6,.Ldiffoffset_vmx_diff_found
588 
589         subfic  r6,r6,16        /* r6 = bytes up to r3's next 16 byte boundary */
590         subf    r5,r6,r5
591         add     r3,r3,r6
592         add     r4,r4,r6
593 
594 .Ldiffoffset_vmx_s1_16bytes_align:
595         /* now s1 is aligned with 16 bytes */
596         lvx     v6,0,r4
597         LVS     v4,0,r4
598         srdi    r6,r5,5  /* loop for 32 bytes each */
599         clrldi  r5,r5,59        /* r5 = r5 & 31: bytes left after the loop */
600         mtctr   r6
601 
602 .balign 16
603 .Ldiffoffset_vmx_32bytesloop:
604         /* the first qw of r4 was saved in v6 */
605         lvx     v9,0,r3
606         LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
607         VCMPEQUB_RC(v7,v9,v10)
608         vor     v6,v8,v8        /* v6 = v8: the 2nd qw is the next 1st qw */
609         bnl     cr6,.Ldiffoffset_vmx_diff_found
610 
611         addi    r3,r3,16
612         addi    r4,r4,16
613 
614         lvx     v9,0,r3
615         LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
616         VCMPEQUB_RC(v7,v9,v10)
617         vor     v6,v8,v8
618         bnl     cr6,.Ldiffoffset_vmx_diff_found
619 
620         addi    r3,r3,16
621         addi    r4,r4,16
622 
623         bdnz    .Ldiffoffset_vmx_32bytesloop
624 
625         EXIT_VMX_OPS
626 
627         cmpdi   r5,0
628         beq     .Lzero
629         b       .Lcmp_lt32bytes
630 
631 .Ldiffoffset_vmx_diff_found:
632         EXIT_VMX_OPS
633         /* anyway, the diff will appear in next 16 bytes */
634         li      r5,16           /* redo these 16 bytes with the scalar code */
635         b       .Lcmp_lt32bytes
636 
637 #endif
638 EXPORT_SYMBOL(memcmp)

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php