~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/arm64/crypto/poly1305-armv8.pl

Version: ~ [ linux-6.12-rc7 ] ~ [ linux-6.11.7 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.60 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.116 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.171 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.229 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.285 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.323 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.12 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

Diff markup

Differences between /arch/arm64/crypto/poly1305-armv8.pl (Version linux-6.12-rc7) and /arch/i386/crypto/poly1305-armv8.pl (Version linux-4.13.16)


  1 #!/usr/bin/env perl                               
  2 # SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
  3 #                                                 
  4 # ============================================    
  5 # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
  6 # project.                                        
  7 # ============================================    
  8 #                                                 
  9 # This module implements Poly1305 hash for ARMv8.
 10 #                                                 
 11 # June 2015                                       
 12 #                                                 
 13 # Numbers are cycles per processed byte with poly1305_blocks alone.
 14 #                                                 
 15 #               IALU/gcc-4.9    NEON              
 16 #                                                 
 17 # Apple A7      1.86/+5%        0.72              
 18 # Cortex-A53    2.69/+58%       1.47              
 19 # Cortex-A57    2.70/+7%        1.14              
 20 # Denver        1.64/+50%       1.18(*)           
 21 # X-Gene        2.13/+68%       2.27              
 22 # Mongoose      1.77/+75%       1.12              
 23 # Kryo          2.70/+55%       1.13              
 24 # ThunderX2     1.17/+95%       1.36              
 25 #                                                 
 26 # (*)   estimate based on resources availability is less than 1.0,
 27 #       i.e. measured result is worse than expected, presumably binary
 28 #       translator is not almighty;
 29                                                   
 30 $flavour=shift;                                   
 31 $output=shift;                                    
 32                                                   
 33 if ($flavour && $flavour ne "void") {             
 34     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;        
 35     ( $xlate="${dir}arm-xlate.pl" and -f $xlat    
 36     ( $xlate="${dir}../../perlasm/arm-xlate.pl    
 37     die "can't locate arm-xlate.pl";              
 38                                                   
 39     open STDOUT,"| \"$^X\" $xlate $flavour $ou    
 40 } else {                                          
 41     open STDOUT,">$output";                       
 42 }                                                 
 43                                                   
 44 my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)    
 45 my ($mac,$nonce)=($inp,$len);                     
 46                                                   
 47 my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d    
 48                                                   
 49 $code.=<<___;                                     
 50 #ifndef __KERNEL__                                
 51 # include "arm_arch.h"                            
 52 .extern OPENSSL_armcap_P                          
 53 #endif                                            
 54                                                   
 55 .text                                             
 56                                                   
 57 // forward "declarations" are required for App    
 58 .globl  poly1305_blocks                           
 59 .globl  poly1305_emit                             
 60                                                   
 61 .globl  poly1305_init                             
 62 .type   poly1305_init,%function                   
 63 .align  5                                         
 64 poly1305_init:                                    
 65         cmp     $inp,xzr                          
 66         stp     xzr,xzr,[$ctx]          // zer    
 67         stp     xzr,xzr,[$ctx,#16]      // [al    
 68                                                   
 69         csel    x0,xzr,x0,eq                      
 70         b.eq    .Lno_key                          
 71                                                   
 72 #ifndef __KERNEL__                                
 73         adrp    x17,OPENSSL_armcap_P              
 74         ldr     w17,[x17,#:lo12:OPENSSL_armcap    
 75 #endif                                            
 76                                                   
 77         ldp     $r0,$r1,[$inp]          // loa    
 78         mov     $s1,#0xfffffffc0fffffff           
 79         movk    $s1,#0x0fff,lsl#48                
 80 #ifdef  __AARCH64EB__                             
 81         rev     $r0,$r0                 // fli    
 82         rev     $r1,$r1                           
 83 #endif                                            
 84         and     $r0,$r0,$s1             // &=0    
 85         and     $s1,$s1,#-4                       
 86         and     $r1,$r1,$s1             // &=0    
 87         mov     w#$s1,#-1                         
 88         stp     $r0,$r1,[$ctx,#32]      // sav    
 89         str     w#$s1,[$ctx,#48]        // imp    
 90                                                   
 91 #ifndef __KERNEL__                                
 92         tst     w17,#ARMV7_NEON                   
 93                                                   
 94         adr     $d0,.Lpoly1305_blocks             
 95         adr     $r0,.Lpoly1305_blocks_neon        
 96         adr     $d1,.Lpoly1305_emit               
 97                                                   
 98         csel    $d0,$d0,$r0,eq                    
 99                                                   
100 # ifdef __ILP32__                                 
101         stp     w#$d0,w#$d1,[$len]                
102 # else                                            
103         stp     $d0,$d1,[$len]                    
104 # endif                                           
105 #endif                                            
106         mov     x0,#1                             
107 .Lno_key:                                         
108         ret                                       
109 .size   poly1305_init,.-poly1305_init             
110                                                   
111 .type   poly1305_blocks,%function                 
112 .align  5                                         
113 poly1305_blocks:                                  
114 .Lpoly1305_blocks:                                
115         ands    $len,$len,#-16                    
116         b.eq    .Lno_data                         
117                                                   
118         ldp     $h0,$h1,[$ctx]          // loa    
119         ldp     $h2,x17,[$ctx,#16]      // [al    
120         ldp     $r0,$r1,[$ctx,#32]      // loa    
121                                                   
122 #ifdef  __AARCH64EB__                             
123         lsr     $d0,$h0,#32                       
124         mov     w#$d1,w#$h0                       
125         lsr     $d2,$h1,#32                       
126         mov     w15,w#$h1                         
127         lsr     x16,$h2,#32                       
128 #else                                             
129         mov     w#$d0,w#$h0                       
130         lsr     $d1,$h0,#32                       
131         mov     w#$d2,w#$h1                       
132         lsr     x15,$h1,#32                       
133         mov     w16,w#$h2                         
134 #endif                                            
135                                                   
136         add     $d0,$d0,$d1,lsl#26      // bas    
137         lsr     $d1,$d2,#12                       
138         adds    $d0,$d0,$d2,lsl#52                
139         add     $d1,$d1,x15,lsl#14                
140         adc     $d1,$d1,xzr                       
141         lsr     $d2,x16,#24                       
142         adds    $d1,$d1,x16,lsl#40                
143         adc     $d2,$d2,xzr                       
144                                                   
145         cmp     x17,#0                  // is_    
146         add     $s1,$r1,$r1,lsr#2       // s1     
147         csel    $h0,$h0,$d0,eq          // cho    
148         csel    $h1,$h1,$d1,eq                    
149         csel    $h2,$h2,$d2,eq                    
150                                                   
151 .Loop:                                            
152         ldp     $t0,$t1,[$inp],#16      // loa    
153         sub     $len,$len,#16                     
154 #ifdef  __AARCH64EB__                             
155         rev     $t0,$t0                           
156         rev     $t1,$t1                           
157 #endif                                            
158         adds    $h0,$h0,$t0             // acc    
159         adcs    $h1,$h1,$t1                       
160                                                   
161         mul     $d0,$h0,$r0             // h0*    
162         adc     $h2,$h2,$padbit                   
163         umulh   $d1,$h0,$r0                       
164                                                   
165         mul     $t0,$h1,$s1             // h1*    
166         umulh   $t1,$h1,$s1                       
167                                                   
168         adds    $d0,$d0,$t0                       
169         mul     $t0,$h0,$r1             // h0*    
170         adc     $d1,$d1,$t1                       
171         umulh   $d2,$h0,$r1                       
172                                                   
173         adds    $d1,$d1,$t0                       
174         mul     $t0,$h1,$r0             // h1*    
175         adc     $d2,$d2,xzr                       
176         umulh   $t1,$h1,$r0                       
177                                                   
178         adds    $d1,$d1,$t0                       
179         mul     $t0,$h2,$s1             // h2*    
180         adc     $d2,$d2,$t1                       
181         mul     $t1,$h2,$r0             // h2*    
182                                                   
183         adds    $d1,$d1,$t0                       
184         adc     $d2,$d2,$t1                       
185                                                   
186         and     $t0,$d2,#-4             // fin    
187         and     $h2,$d2,#3                        
188         add     $t0,$t0,$d2,lsr#2                 
189         adds    $h0,$d0,$t0                       
190         adcs    $h1,$d1,xzr                       
191         adc     $h2,$h2,xzr                       
192                                                   
193         cbnz    $len,.Loop                        
194                                                   
195         stp     $h0,$h1,[$ctx]          // sto    
196         stp     $h2,xzr,[$ctx,#16]      // [an    
197                                                   
198 .Lno_data:                                        
199         ret                                       
200 .size   poly1305_blocks,.-poly1305_blocks         
201                                                   
202 .type   poly1305_emit,%function                   
203 .align  5                                         
204 poly1305_emit:                                    
205 .Lpoly1305_emit:                                  
206         ldp     $h0,$h1,[$ctx]          // loa    
207         ldp     $h2,$r0,[$ctx,#16]      // [al    
208         ldp     $t0,$t1,[$nonce]        // loa    
209                                                   
210 #ifdef  __AARCH64EB__                             
211         lsr     $d0,$h0,#32                       
212         mov     w#$d1,w#$h0                       
213         lsr     $d2,$h1,#32                       
214         mov     w15,w#$h1                         
215         lsr     x16,$h2,#32                       
216 #else                                             
217         mov     w#$d0,w#$h0                       
218         lsr     $d1,$h0,#32                       
219         mov     w#$d2,w#$h1                       
220         lsr     x15,$h1,#32                       
221         mov     w16,w#$h2                         
222 #endif                                            
223                                                   
224         add     $d0,$d0,$d1,lsl#26      // bas    
225         lsr     $d1,$d2,#12                       
226         adds    $d0,$d0,$d2,lsl#52                
227         add     $d1,$d1,x15,lsl#14                
228         adc     $d1,$d1,xzr                       
229         lsr     $d2,x16,#24                       
230         adds    $d1,$d1,x16,lsl#40                
231         adc     $d2,$d2,xzr                       
232                                                   
233         cmp     $r0,#0                  // is_    
234         csel    $h0,$h0,$d0,eq          // cho    
235         csel    $h1,$h1,$d1,eq                    
236         csel    $h2,$h2,$d2,eq                    
237                                                   
238         adds    $d0,$h0,#5              // com    
239         adcs    $d1,$h1,xzr                       
240         adc     $d2,$h2,xzr                       
241                                                   
242         tst     $d2,#-4                 // see    
243                                                   
244         csel    $h0,$h0,$d0,eq                    
245         csel    $h1,$h1,$d1,eq                    
246                                                   
247 #ifdef  __AARCH64EB__                             
248         ror     $t0,$t0,#32             // fli    
249         ror     $t1,$t1,#32                       
250 #endif                                            
251         adds    $h0,$h0,$t0             // acc    
252         adc     $h1,$h1,$t1                       
253 #ifdef  __AARCH64EB__                             
254         rev     $h0,$h0                 // fli    
255         rev     $h1,$h1                           
256 #endif                                            
257         stp     $h0,$h1,[$mac]          // wri    
258                                                   
259         ret                                       
260 .size   poly1305_emit,.-poly1305_emit             
261 ___                                               
262 my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map    
263 my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) =    
264 my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) =    
265 my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.    
266 my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..2    
267 my ($T0,$T1,$MASK) = map("v$_",(29..31));         
268                                                   
269 my ($in2,$zeros)=("x16","x17");                   
270 my $is_base2_26 = $zeros;               # borr    
271                                                   
272 $code.=<<___;                                     
273 .type   poly1305_mult,%function                   
274 .align  5                                         
275 poly1305_mult:                                    
276         mul     $d0,$h0,$r0             // h0*    
277         umulh   $d1,$h0,$r0                       
278                                                   
279         mul     $t0,$h1,$s1             // h1*    
280         umulh   $t1,$h1,$s1                       
281                                                   
282         adds    $d0,$d0,$t0                       
283         mul     $t0,$h0,$r1             // h0*    
284         adc     $d1,$d1,$t1                       
285         umulh   $d2,$h0,$r1                       
286                                                   
287         adds    $d1,$d1,$t0                       
288         mul     $t0,$h1,$r0             // h1*    
289         adc     $d2,$d2,xzr                       
290         umulh   $t1,$h1,$r0                       
291                                                   
292         adds    $d1,$d1,$t0                       
293         mul     $t0,$h2,$s1             // h2*    
294         adc     $d2,$d2,$t1                       
295         mul     $t1,$h2,$r0             // h2*    
296                                                   
297         adds    $d1,$d1,$t0                       
298         adc     $d2,$d2,$t1                       
299                                                   
300         and     $t0,$d2,#-4             // fin    
301         and     $h2,$d2,#3                        
302         add     $t0,$t0,$d2,lsr#2                 
303         adds    $h0,$d0,$t0                       
304         adcs    $h1,$d1,xzr                       
305         adc     $h2,$h2,xzr                       
306                                                   
307         ret                                       
308 .size   poly1305_mult,.-poly1305_mult             
309                                                   
310 .type   poly1305_splat,%function                  
311 .align  4                                         
312 poly1305_splat:                                   
313         and     x12,$h0,#0x03ffffff     // bas    
314         ubfx    x13,$h0,#26,#26                   
315         extr    x14,$h1,$h0,#52                   
316         and     x14,x14,#0x03ffffff               
317         ubfx    x15,$h1,#14,#26                   
318         extr    x16,$h2,$h1,#40                   
319                                                   
320         str     w12,[$ctx,#16*0]        // r0     
321         add     w12,w13,w13,lsl#2       // r1*    
322         str     w13,[$ctx,#16*1]        // r1     
323         add     w13,w14,w14,lsl#2       // r2*    
324         str     w12,[$ctx,#16*2]        // s1     
325         str     w14,[$ctx,#16*3]        // r2     
326         add     w14,w15,w15,lsl#2       // r3*    
327         str     w13,[$ctx,#16*4]        // s2     
328         str     w15,[$ctx,#16*5]        // r3     
329         add     w15,w16,w16,lsl#2       // r4*    
330         str     w14,[$ctx,#16*6]        // s3     
331         str     w16,[$ctx,#16*7]        // r4     
332         str     w15,[$ctx,#16*8]        // s4     
333                                                   
334         ret                                       
335 .size   poly1305_splat,.-poly1305_splat           
336                                                   
337 #ifdef  __KERNEL__                                
338 .globl  poly1305_blocks_neon                      
339 #endif                                            
340 .type   poly1305_blocks_neon,%function            
341 .align  5                                         
342 poly1305_blocks_neon:                             
343 .Lpoly1305_blocks_neon:                           
344         ldr     $is_base2_26,[$ctx,#24]           
345         cmp     $len,#128                         
346         b.lo    .Lpoly1305_blocks                 
347                                                   
348         .inst   0xd503233f              // pac    
349         stp     x29,x30,[sp,#-80]!                
350         add     x29,sp,#0                         
351                                                   
352         stp     d8,d9,[sp,#16]          // mee    
353         stp     d10,d11,[sp,#32]                  
354         stp     d12,d13,[sp,#48]                  
355         stp     d14,d15,[sp,#64]                  
356                                                   
357         cbz     $is_base2_26,.Lbase2_64_neon      
358                                                   
359         ldp     w10,w11,[$ctx]          // loa    
360         ldp     w12,w13,[$ctx,#8]                 
361         ldr     w14,[$ctx,#16]                    
362                                                   
363         tst     $len,#31                          
364         b.eq    .Leven_neon                       
365                                                   
366         ldp     $r0,$r1,[$ctx,#32]      // loa    
367                                                   
368         add     $h0,x10,x11,lsl#26      // bas    
369         lsr     $h1,x12,#12                       
370         adds    $h0,$h0,x12,lsl#52                
371         add     $h1,$h1,x13,lsl#14                
372         adc     $h1,$h1,xzr                       
373         lsr     $h2,x14,#24                       
374         adds    $h1,$h1,x14,lsl#40                
375         adc     $d2,$h2,xzr             // can    
376                                                   
377         ldp     $d0,$d1,[$inp],#16      // loa    
378         sub     $len,$len,#16                     
379         add     $s1,$r1,$r1,lsr#2       // s1     
380                                                   
381 #ifdef  __AARCH64EB__                             
382         rev     $d0,$d0                           
383         rev     $d1,$d1                           
384 #endif                                            
385         adds    $h0,$h0,$d0             // acc    
386         adcs    $h1,$h1,$d1                       
387         adc     $h2,$h2,$padbit                   
388                                                   
389         bl      poly1305_mult                     
390                                                   
391         and     x10,$h0,#0x03ffffff     // bas    
392         ubfx    x11,$h0,#26,#26                   
393         extr    x12,$h1,$h0,#52                   
394         and     x12,x12,#0x03ffffff               
395         ubfx    x13,$h1,#14,#26                   
396         extr    x14,$h2,$h1,#40                   
397                                                   
398         b       .Leven_neon                       
399                                                   
400 .align  4                                         
401 .Lbase2_64_neon:                                  
402         ldp     $r0,$r1,[$ctx,#32]      // loa    
403                                                   
404         ldp     $h0,$h1,[$ctx]          // loa    
405         ldr     $h2,[$ctx,#16]                    
406                                                   
407         tst     $len,#31                          
408         b.eq    .Linit_neon                       
409                                                   
410         ldp     $d0,$d1,[$inp],#16      // loa    
411         sub     $len,$len,#16                     
412         add     $s1,$r1,$r1,lsr#2       // s1     
413 #ifdef  __AARCH64EB__                             
414         rev     $d0,$d0                           
415         rev     $d1,$d1                           
416 #endif                                            
417         adds    $h0,$h0,$d0             // acc    
418         adcs    $h1,$h1,$d1                       
419         adc     $h2,$h2,$padbit                   
420                                                   
421         bl      poly1305_mult                     
422                                                   
423 .Linit_neon:                                      
424         ldr     w17,[$ctx,#48]          // fir    
425         and     x10,$h0,#0x03ffffff     // bas    
426         ubfx    x11,$h0,#26,#26                   
427         extr    x12,$h1,$h0,#52                   
428         and     x12,x12,#0x03ffffff               
429         ubfx    x13,$h1,#14,#26                   
430         extr    x14,$h2,$h1,#40                   
431                                                   
432         cmp     w17,#-1                 // is     
433         b.ne    .Leven_neon                       
434                                                   
435         fmov    ${H0},x10                         
436         fmov    ${H1},x11                         
437         fmov    ${H2},x12                         
438         fmov    ${H3},x13                         
439         fmov    ${H4},x14                         
440                                                   
441         ////////////////////////////////// ini    
442         mov     $h0,$r0                 // r^1    
443         add     $s1,$r1,$r1,lsr#2       // s1     
444         mov     $h1,$r1                           
445         mov     $h2,xzr                           
446         add     $ctx,$ctx,#48+12                  
447         bl      poly1305_splat                    
448                                                   
449         bl      poly1305_mult           // r^2    
450         sub     $ctx,$ctx,#4                      
451         bl      poly1305_splat                    
452                                                   
453         bl      poly1305_mult           // r^3    
454         sub     $ctx,$ctx,#4                      
455         bl      poly1305_splat                    
456                                                   
457         bl      poly1305_mult           // r^4    
458         sub     $ctx,$ctx,#4                      
459         bl      poly1305_splat                    
460         sub     $ctx,$ctx,#48           // res    
461         b       .Ldo_neon                         
462                                                   
463 .align  4                                         
464 .Leven_neon:                                      
465         fmov    ${H0},x10                         
466         fmov    ${H1},x11                         
467         fmov    ${H2},x12                         
468         fmov    ${H3},x13                         
469         fmov    ${H4},x14                         
470                                                   
471 .Ldo_neon:                                        
472         ldp     x8,x12,[$inp,#32]       // inp    
473         subs    $len,$len,#64                     
474         ldp     x9,x13,[$inp,#48]                 
475         add     $in2,$inp,#96                     
476         adrp    $zeros,.Lzeros                    
477         add     $zeros,$zeros,#:lo12:.Lzeros      
478                                                   
479         lsl     $padbit,$padbit,#24               
480         add     x15,$ctx,#48                      
481                                                   
482 #ifdef  __AARCH64EB__                             
483         rev     x8,x8                             
484         rev     x12,x12                           
485         rev     x9,x9                             
486         rev     x13,x13                           
487 #endif                                            
488         and     x4,x8,#0x03ffffff       // bas    
489         and     x5,x9,#0x03ffffff                 
490         ubfx    x6,x8,#26,#26                     
491         ubfx    x7,x9,#26,#26                     
492         add     x4,x4,x5,lsl#32         // bfi    
493         extr    x8,x12,x8,#52                     
494         extr    x9,x13,x9,#52                     
495         add     x6,x6,x7,lsl#32         // bfi    
496         fmov    $IN23_0,x4                        
497         and     x8,x8,#0x03ffffff                 
498         and     x9,x9,#0x03ffffff                 
499         ubfx    x10,x12,#14,#26                   
500         ubfx    x11,x13,#14,#26                   
501         add     x12,$padbit,x12,lsr#40            
502         add     x13,$padbit,x13,lsr#40            
503         add     x8,x8,x9,lsl#32         // bfi    
504         fmov    $IN23_1,x6                        
505         add     x10,x10,x11,lsl#32      // bfi    
506         add     x12,x12,x13,lsl#32      // bfi    
507         fmov    $IN23_2,x8                        
508         fmov    $IN23_3,x10                       
509         fmov    $IN23_4,x12                       
510                                                   
511         ldp     x8,x12,[$inp],#16       // inp    
512         ldp     x9,x13,[$inp],#48                 
513                                                   
514         ld1     {$R0,$R1,$S1,$R2},[x15],#64       
515         ld1     {$S2,$R3,$S3,$R4},[x15],#64       
516         ld1     {$S4},[x15]                       
517                                                   
518 #ifdef  __AARCH64EB__                             
519         rev     x8,x8                             
520         rev     x12,x12                           
521         rev     x9,x9                             
522         rev     x13,x13                           
523 #endif                                            
524         and     x4,x8,#0x03ffffff       // bas    
525         and     x5,x9,#0x03ffffff                 
526         ubfx    x6,x8,#26,#26                     
527         ubfx    x7,x9,#26,#26                     
528         add     x4,x4,x5,lsl#32         // bfi    
529         extr    x8,x12,x8,#52                     
530         extr    x9,x13,x9,#52                     
531         add     x6,x6,x7,lsl#32         // bfi    
532         fmov    $IN01_0,x4                        
533         and     x8,x8,#0x03ffffff                 
534         and     x9,x9,#0x03ffffff                 
535         ubfx    x10,x12,#14,#26                   
536         ubfx    x11,x13,#14,#26                   
537         add     x12,$padbit,x12,lsr#40            
538         add     x13,$padbit,x13,lsr#40            
539         add     x8,x8,x9,lsl#32         // bfi    
540         fmov    $IN01_1,x6                        
541         add     x10,x10,x11,lsl#32      // bfi    
542         add     x12,x12,x13,lsl#32      // bfi    
543         movi    $MASK.2d,#-1                      
544         fmov    $IN01_2,x8                        
545         fmov    $IN01_3,x10                       
546         fmov    $IN01_4,x12                       
547         ushr    $MASK.2d,$MASK.2d,#38             
548                                                   
549         b.ls    .Lskip_loop                       
550                                                   
551 .align  4                                         
552 .Loop_neon:                                       
553         //////////////////////////////////////    
554         // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4    
555         // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3    
556         //   \___________________/                
557         // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4    
558         // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4    
559         //   \___________________/ \__________    
560         //                                        
561         // Note that we start with inp[2:3]*r^    
562         // doesn't depend on reduction in prev    
563         //////////////////////////////////////    
564         // d4 = h0*r4 + h1*r3   + h2*r2   + h3    
565         // d3 = h0*r3 + h1*r2   + h2*r1   + h3    
566         // d2 = h0*r2 + h1*r1   + h2*r0   + h3    
567         // d1 = h0*r1 + h1*r0   + h2*5*r4 + h3    
568         // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3    
569                                                   
570         subs    $len,$len,#64                     
571         umull   $ACC4,$IN23_0,${R4}[2]            
572         csel    $in2,$zeros,$in2,lo               
573         umull   $ACC3,$IN23_0,${R3}[2]            
574         umull   $ACC2,$IN23_0,${R2}[2]            
575          ldp    x8,x12,[$in2],#16       // inp    
576         umull   $ACC1,$IN23_0,${R1}[2]            
577          ldp    x9,x13,[$in2],#48                 
578         umull   $ACC0,$IN23_0,${R0}[2]            
579 #ifdef  __AARCH64EB__                             
580          rev    x8,x8                             
581          rev    x12,x12                           
582          rev    x9,x9                             
583          rev    x13,x13                           
584 #endif                                            
585                                                   
586         umlal   $ACC4,$IN23_1,${R3}[2]            
587          and    x4,x8,#0x03ffffff       // bas    
588         umlal   $ACC3,$IN23_1,${R2}[2]            
589          and    x5,x9,#0x03ffffff                 
590         umlal   $ACC2,$IN23_1,${R1}[2]            
591          ubfx   x6,x8,#26,#26                     
592         umlal   $ACC1,$IN23_1,${R0}[2]            
593          ubfx   x7,x9,#26,#26                     
594         umlal   $ACC0,$IN23_1,${S4}[2]            
595          add    x4,x4,x5,lsl#32         // bfi    
596                                                   
597         umlal   $ACC4,$IN23_2,${R2}[2]            
598          extr   x8,x12,x8,#52                     
599         umlal   $ACC3,$IN23_2,${R1}[2]            
600          extr   x9,x13,x9,#52                     
601         umlal   $ACC2,$IN23_2,${R0}[2]            
602          add    x6,x6,x7,lsl#32         // bfi    
603         umlal   $ACC1,$IN23_2,${S4}[2]            
604          fmov   $IN23_0,x4                        
605         umlal   $ACC0,$IN23_2,${S3}[2]            
606          and    x8,x8,#0x03ffffff                 
607                                                   
608         umlal   $ACC4,$IN23_3,${R1}[2]            
609          and    x9,x9,#0x03ffffff                 
610         umlal   $ACC3,$IN23_3,${R0}[2]            
611          ubfx   x10,x12,#14,#26                   
612         umlal   $ACC2,$IN23_3,${S4}[2]            
613          ubfx   x11,x13,#14,#26                   
614         umlal   $ACC1,$IN23_3,${S3}[2]            
615          add    x8,x8,x9,lsl#32         // bfi    
616         umlal   $ACC0,$IN23_3,${S2}[2]            
617          fmov   $IN23_1,x6                        
618                                                   
619         add     $IN01_2,$IN01_2,$H2               
620          add    x12,$padbit,x12,lsr#40            
621         umlal   $ACC4,$IN23_4,${R0}[2]            
622          add    x13,$padbit,x13,lsr#40            
623         umlal   $ACC3,$IN23_4,${S4}[2]            
624          add    x10,x10,x11,lsl#32      // bfi    
625         umlal   $ACC2,$IN23_4,${S3}[2]            
626          add    x12,x12,x13,lsl#32      // bfi    
627         umlal   $ACC1,$IN23_4,${S2}[2]            
628          fmov   $IN23_2,x8                        
629         umlal   $ACC0,$IN23_4,${S1}[2]            
630          fmov   $IN23_3,x10                       
631                                                   
632         //////////////////////////////////////    
633         // (hash+inp[0:1])*r^4 and accumulate     
634                                                   
635         add     $IN01_0,$IN01_0,$H0               
636          fmov   $IN23_4,x12                       
637         umlal   $ACC3,$IN01_2,${R1}[0]            
638          ldp    x8,x12,[$inp],#16       // inp    
639         umlal   $ACC0,$IN01_2,${S3}[0]            
640          ldp    x9,x13,[$inp],#48                 
641         umlal   $ACC4,$IN01_2,${R2}[0]            
642         umlal   $ACC1,$IN01_2,${S4}[0]            
643         umlal   $ACC2,$IN01_2,${R0}[0]            
644 #ifdef  __AARCH64EB__                             
645          rev    x8,x8                             
646          rev    x12,x12                           
647          rev    x9,x9                             
648          rev    x13,x13                           
649 #endif                                            
650                                                   
651         add     $IN01_1,$IN01_1,$H1               
652         umlal   $ACC3,$IN01_0,${R3}[0]            
653         umlal   $ACC4,$IN01_0,${R4}[0]            
654          and    x4,x8,#0x03ffffff       // bas    
655         umlal   $ACC2,$IN01_0,${R2}[0]            
656          and    x5,x9,#0x03ffffff                 
657         umlal   $ACC0,$IN01_0,${R0}[0]            
658          ubfx   x6,x8,#26,#26                     
659         umlal   $ACC1,$IN01_0,${R1}[0]            
660          ubfx   x7,x9,#26,#26                     
661                                                   
662         add     $IN01_3,$IN01_3,$H3               
663          add    x4,x4,x5,lsl#32         // bfi    
664         umlal   $ACC3,$IN01_1,${R2}[0]            
665          extr   x8,x12,x8,#52                     
666         umlal   $ACC4,$IN01_1,${R3}[0]            
667          extr   x9,x13,x9,#52                     
668         umlal   $ACC0,$IN01_1,${S4}[0]            
669          add    x6,x6,x7,lsl#32         // bfi    
670         umlal   $ACC2,$IN01_1,${R1}[0]            
671          fmov   $IN01_0,x4                        
672         umlal   $ACC1,$IN01_1,${R0}[0]            
673          and    x8,x8,#0x03ffffff                 
674                                                   
675         add     $IN01_4,$IN01_4,$H4               
676          and    x9,x9,#0x03ffffff                 
677         umlal   $ACC3,$IN01_3,${R0}[0]            
678          ubfx   x10,x12,#14,#26                   
679         umlal   $ACC0,$IN01_3,${S2}[0]            
680          ubfx   x11,x13,#14,#26                   
681         umlal   $ACC4,$IN01_3,${R1}[0]            
682          add    x8,x8,x9,lsl#32         // bfi    
683         umlal   $ACC1,$IN01_3,${S3}[0]            
684          fmov   $IN01_1,x6                        
685         umlal   $ACC2,$IN01_3,${S4}[0]            
686          add    x12,$padbit,x12,lsr#40            
687                                                   
688         umlal   $ACC3,$IN01_4,${S4}[0]            
689          add    x13,$padbit,x13,lsr#40            
690         umlal   $ACC0,$IN01_4,${S1}[0]            
691          add    x10,x10,x11,lsl#32      // bfi    
692         umlal   $ACC4,$IN01_4,${R0}[0]            
693          add    x12,x12,x13,lsl#32      // bfi    
694         umlal   $ACC1,$IN01_4,${S2}[0]            
695          fmov   $IN01_2,x8                        
696         umlal   $ACC2,$IN01_4,${S3}[0]            
697          fmov   $IN01_3,x10                       
698          fmov   $IN01_4,x12                       
699                                                   
700         //////////////////////////////////////    
701         // lazy reduction as discussed in "NEO    
702         // and P. Schwabe                         
703         //                                        
704         // [see discussion in poly1305-armv4 m    
705                                                   
706         ushr    $T0.2d,$ACC3,#26                  
707         xtn     $H3,$ACC3                         
708          ushr   $T1.2d,$ACC0,#26                  
709          and    $ACC0,$ACC0,$MASK.2d              
710         add     $ACC4,$ACC4,$T0.2d      // h3     
711         bic     $H3,#0xfc,lsl#24        // &=0    
712          add    $ACC1,$ACC1,$T1.2d      // h0     
713                                                   
714         ushr    $T0.2d,$ACC4,#26                  
715         xtn     $H4,$ACC4                         
716          ushr   $T1.2d,$ACC1,#26                  
717          xtn    $H1,$ACC1                         
718         bic     $H4,#0xfc,lsl#24                  
719          add    $ACC2,$ACC2,$T1.2d      // h1     
720                                                   
721         add     $ACC0,$ACC0,$T0.2d                
722         shl     $T0.2d,$T0.2d,#2                  
723          shrn   $T1.2s,$ACC2,#26                  
724          xtn    $H2,$ACC2                         
725         add     $ACC0,$ACC0,$T0.2d      // h4     
726          bic    $H1,#0xfc,lsl#24                  
727          add    $H3,$H3,$T1.2s          // h2     
728          bic    $H2,#0xfc,lsl#24                  
729                                                   
730         shrn    $T0.2s,$ACC0,#26                  
731         xtn     $H0,$ACC0                         
732          ushr   $T1.2s,$H3,#26                    
733          bic    $H3,#0xfc,lsl#24                  
734          bic    $H0,#0xfc,lsl#24                  
735         add     $H1,$H1,$T0.2s          // h0     
736          add    $H4,$H4,$T1.2s          // h3     
737                                                   
738         b.hi    .Loop_neon                        
739                                                   
740 .Lskip_loop:                                      
741         dup     $IN23_2,${IN23_2}[0]              
742         add     $IN01_2,$IN01_2,$H2               
743                                                   
744         //////////////////////////////////////    
745         // multiply (inp[0:1]+hash) or inp[2:3    
746                                                   
747         adds    $len,$len,#32                     
748         b.ne    .Long_tail                        
749                                                   
750         dup     $IN23_2,${IN01_2}[0]              
751         add     $IN23_0,$IN01_0,$H0               
752         add     $IN23_3,$IN01_3,$H3               
753         add     $IN23_1,$IN01_1,$H1               
754         add     $IN23_4,$IN01_4,$H4               
755                                                   
756 .Long_tail:                                       
757         dup     $IN23_0,${IN23_0}[0]              
758         umull2  $ACC0,$IN23_2,${S3}               
759         umull2  $ACC3,$IN23_2,${R1}               
760         umull2  $ACC4,$IN23_2,${R2}               
761         umull2  $ACC2,$IN23_2,${R0}               
762         umull2  $ACC1,$IN23_2,${S4}               
763                                                   
764         dup     $IN23_1,${IN23_1}[0]              
765         umlal2  $ACC0,$IN23_0,${R0}               
766         umlal2  $ACC2,$IN23_0,${R2}               
767         umlal2  $ACC3,$IN23_0,${R3}               
768         umlal2  $ACC4,$IN23_0,${R4}               
769         umlal2  $ACC1,$IN23_0,${R1}               
770                                                   
771         dup     $IN23_3,${IN23_3}[0]              
772         umlal2  $ACC0,$IN23_1,${S4}               
773         umlal2  $ACC3,$IN23_1,${R2}               
774         umlal2  $ACC2,$IN23_1,${R1}               
775         umlal2  $ACC4,$IN23_1,${R3}               
776         umlal2  $ACC1,$IN23_1,${R0}               
777                                                   
778         dup     $IN23_4,${IN23_4}[0]              
779         umlal2  $ACC3,$IN23_3,${R0}               
780         umlal2  $ACC4,$IN23_3,${R1}               
781         umlal2  $ACC0,$IN23_3,${S2}               
782         umlal2  $ACC1,$IN23_3,${S3}               
783         umlal2  $ACC2,$IN23_3,${S4}               
784                                                   
785         umlal2  $ACC3,$IN23_4,${S4}               
786         umlal2  $ACC0,$IN23_4,${S1}               
787         umlal2  $ACC4,$IN23_4,${R0}               
788         umlal2  $ACC1,$IN23_4,${S2}               
789         umlal2  $ACC2,$IN23_4,${S3}               
790                                                   
791         b.eq    .Lshort_tail                      
792                                                   
793         //////////////////////////////////////    
794         // (hash+inp[0:1])*r^4:r^3 and accumul    
795                                                   
796         add     $IN01_0,$IN01_0,$H0               
797         umlal   $ACC3,$IN01_2,${R1}               
798         umlal   $ACC0,$IN01_2,${S3}               
799         umlal   $ACC4,$IN01_2,${R2}               
800         umlal   $ACC1,$IN01_2,${S4}               
801         umlal   $ACC2,$IN01_2,${R0}               
802                                                   
803         add     $IN01_1,$IN01_1,$H1               
804         umlal   $ACC3,$IN01_0,${R3}               
805         umlal   $ACC0,$IN01_0,${R0}               
806         umlal   $ACC4,$IN01_0,${R4}               
807         umlal   $ACC1,$IN01_0,${R1}               
808         umlal   $ACC2,$IN01_0,${R2}               
809                                                   
810         add     $IN01_3,$IN01_3,$H3               
811         umlal   $ACC3,$IN01_1,${R2}               
812         umlal   $ACC0,$IN01_1,${S4}               
813         umlal   $ACC4,$IN01_1,${R3}               
814         umlal   $ACC1,$IN01_1,${R0}               
815         umlal   $ACC2,$IN01_1,${R1}               
816                                                   
817         add     $IN01_4,$IN01_4,$H4               
818         umlal   $ACC3,$IN01_3,${R0}               
819         umlal   $ACC0,$IN01_3,${S2}               
820         umlal   $ACC4,$IN01_3,${R1}               
821         umlal   $ACC1,$IN01_3,${S3}               
822         umlal   $ACC2,$IN01_3,${S4}               
823                                                   
824         umlal   $ACC3,$IN01_4,${S4}               
825         umlal   $ACC0,$IN01_4,${S1}               
826         umlal   $ACC4,$IN01_4,${R0}               
827         umlal   $ACC1,$IN01_4,${S2}               
828         umlal   $ACC2,$IN01_4,${S3}               
829                                                   
830 .Lshort_tail:                                     
831         //////////////////////////////////////    
832         // horizontal add                         
833                                                   
834         addp    $ACC3,$ACC3,$ACC3                 
835          ldp    d8,d9,[sp,#16]          // mee    
836         addp    $ACC0,$ACC0,$ACC0                 
837          ldp    d10,d11,[sp,#32]                  
838         addp    $ACC4,$ACC4,$ACC4                 
839          ldp    d12,d13,[sp,#48]                  
840         addp    $ACC1,$ACC1,$ACC1                 
841          ldp    d14,d15,[sp,#64]                  
842         addp    $ACC2,$ACC2,$ACC2                 
843          ldr    x30,[sp,#8]                       
844                                                   
845         //////////////////////////////////////    
846         // lazy reduction, but without narrowi    
847                                                   
848         ushr    $T0.2d,$ACC3,#26                  
849         and     $ACC3,$ACC3,$MASK.2d              
850          ushr   $T1.2d,$ACC0,#26                  
851          and    $ACC0,$ACC0,$MASK.2d              
852                                                   
853         add     $ACC4,$ACC4,$T0.2d      // h3     
854          add    $ACC1,$ACC1,$T1.2d      // h0     
855                                                   
856         ushr    $T0.2d,$ACC4,#26                  
857         and     $ACC4,$ACC4,$MASK.2d              
858          ushr   $T1.2d,$ACC1,#26                  
859          and    $ACC1,$ACC1,$MASK.2d              
860          add    $ACC2,$ACC2,$T1.2d      // h1     
861                                                   
862         add     $ACC0,$ACC0,$T0.2d                
863         shl     $T0.2d,$T0.2d,#2                  
864          ushr   $T1.2d,$ACC2,#26                  
865          and    $ACC2,$ACC2,$MASK.2d              
866         add     $ACC0,$ACC0,$T0.2d      // h4     
867          add    $ACC3,$ACC3,$T1.2d      // h2     
868                                                   
869         ushr    $T0.2d,$ACC0,#26                  
870         and     $ACC0,$ACC0,$MASK.2d              
871          ushr   $T1.2d,$ACC3,#26                  
872          and    $ACC3,$ACC3,$MASK.2d              
873         add     $ACC1,$ACC1,$T0.2d      // h0     
874          add    $ACC4,$ACC4,$T1.2d      // h3     
875                                                   
876         //////////////////////////////////////    
877         // write the result, can be partially     
878                                                   
879         st4     {$ACC0,$ACC1,$ACC2,$ACC3}[0],[    
880         mov     x4,#1                             
881         st1     {$ACC4}[0],[$ctx]                 
882         str     x4,[$ctx,#8]            // set    
883                                                   
884         ldr     x29,[sp],#80                      
885          .inst  0xd50323bf              // aut    
886         ret                                       
887 .size   poly1305_blocks_neon,.-poly1305_blocks    
888                                                   
889 .pushsection .rodata                              
890 .align  5                                         
891 .Lzeros:                                          
892 .long   0,0,0,0,0,0,0,0                           
893 .asciz  "Poly1305 for ARMv8, CRYPTOGAMS by \@d    
894 .popsection                                       
895                                                   
896 .align  2                                         
897 #if !defined(__KERNEL__) && !defined(_WIN64)      
898 .comm   OPENSSL_armcap_P,4,4                      
899 .hidden OPENSSL_armcap_P                          
900 #endif                                            
901 ___                                               
902                                                   
# Post-process the generated assembly text one line at a time and print it.
# Each line of $code (split on "\n") is loaded into $_, rewritten by the
# substitutions below, and then written to STDOUT.
# NOTE(review): this listing cuts long lines at a fixed column, so the ends
# of several statements below are not visible; comments describe only the
# visible part.
903 foreach (split("\n",$code)) {                     
        # Per-mnemonic fix-ups of vector arrangement suffixes:
        #  - shrn:        a ".2d"/".4d" suffix right after the first v-register
        #                 becomes ".2s".
        #  - fmov v,x:    matches "fmov vN...,xM"; the replacement is cut off
        #                 in this listing -- TODO confirm against full source.
        #  - dup:         every ".2s"/".4s" becomes ".2d"; the "or 1" keeps the
        #                 group true even when nothing was substituted.
        #  - eor/and:     rewrites ".[248][sdh]" suffixes; the replacement
        #                 (starts with ".1") is cut off -- TODO confirm.
        #  - umull/umlal: every ".4s" becomes ".2s".
        #  - umull2/umlal2: every ".2s" becomes ".4s".
        #  - st1..st4 with a lane-indexed "{...}[" register list: suffix
        #                 rewrite whose pattern/replacement is cut off.
        # Presumably these groups are chained so only the first matching
        # mnemonic applies; the joining operators are not visible here.
904         s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/        
905         s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]    
906         (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)    
907         (m/\b(eor|and)/ and (s/\.[248][sdh]/.1    
908         (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g o    
909         (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g     
910         (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24    
911                                                   
        # Drop the lane-count digit in front of an element index, e.g.
        # ".4s[" -> ".s[" (first occurrence on the line only; no /g).
912         s/\.[124]([sd])\[/.$1\[/;                 
        # Collapse every "w#x<N>" into "w<N>".
        # Presumably "w#" is prepended to a variable that expands to an
        # x-register name -- verify against the part of the file not shown.
913         s/w#x([0-9]+)/w$1/g;                      
914                                                   
        # Emit the rewritten line; split() removed the newline, so add it back.
915         print $_,"\n";                            
916 }                                                 
# Close STDOUT once all lines are written; the result of close is not checked.
917 close STDOUT;                                     
                                                      

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php