TOMOYO Linux Cross Reference
Linux/arch/x86/crypto/chacha-avx2-x86_64.S

  1 /* SPDX-License-Identifier: GPL-2.0-or-later */
  2 /*
  3  * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
  4  *                                                
  5  * Copyright (C) 2015 Martin Willi                
  6  */                                               
  7                                                   
  8 #include <linux/linkage.h>                        
  9                                                   
 10 .section        .rodata.cst32.ROT8, "aM", @progbits, 32
 11 .align 32
 12 ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
 13         .octa 0x0e0d0c0f0a09080b0605040702010003
 14
 15 .section        .rodata.cst32.ROT16, "aM", @progbits, 32
 16 .align 32
 17 ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
 18         .octa 0x0d0c0f0e09080b0a0504070601000302
 19
 20 .section        .rodata.cst32.CTRINC, "aM", @progbits, 32
 21 .align 32
 22 CTRINC: .octa 0x00000003000000020000000100000000
 23         .octa 0x00000007000000060000000500000004
 24
 25 .section        .rodata.cst32.CTR2BL, "aM", @progbits, 32
 26 .align 32
 27 CTR2BL: .octa 0x00000000000000000000000000000000
 28         .octa 0x00000000000000000000000000000001
 29
 30 .section        .rodata.cst32.CTR4BL, "aM", @progbits, 32
 31 .align 32
 32 CTR4BL: .octa 0x00000000000000000000000000000002
 33         .octa 0x00000000000000000000000000000003
 34                                                   
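The ROT8 and ROT16 constants above are per-lane byte-permutation masks for vpshufb: rotating a 32-bit word left by 8 or 16 bits only moves whole bytes, so the rotation can be done with a single byte shuffle instead of shift+OR. CTRINC, CTR2BL and CTR4BL add consecutive block-counter values to the counter lanes. A minimal standalone C sketch of the byte-shuffle trick (illustrative only, not kernel code):

    /*
     * Shows why a byte shuffle can implement rotl32 by 8: each result byte
     * comes from a fixed input byte, which is exactly what one 32-bit lane
     * of the ROT8 mask above encodes for vpshufb.
     */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static uint32_t rotl32(uint32_t x, int n)
    {
            return (x << n) | (x >> (32 - n));
    }

    static uint32_t rot8_by_byte_shuffle(uint32_t x)
    {
            uint8_t in[4], out[4];
            /* result byte 0 takes input byte 3, byte 1 takes byte 0, ...
             * matching the 03,00,01,02 selectors in each ROT8 lane */
            static const int idx[4] = { 3, 0, 1, 2 };

            memcpy(in, &x, 4);              /* little-endian byte view */
            for (int i = 0; i < 4; i++)
                    out[i] = in[idx[i]];
            memcpy(&x, out, 4);
            return x;
    }

    int main(void)
    {
            uint32_t x = 0x61707865;        /* "expa", first ChaCha constant */

            assert(rot8_by_byte_shuffle(x) == rotl32(x, 8));
            printf("rotl32(x, 8) via byte shuffle: %08x\n", rot8_by_byte_shuffle(x));
            return 0;
    }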
 35 .text                                             
 36                                                   
 37 SYM_FUNC_START(chacha_2block_xor_avx2)            
 38         # %rdi: Input state matrix, s             
 39         # %rsi: up to 2 data blocks output, o     
 40         # %rdx: up to 2 data blocks input, i      
 41         # %rcx: input/output length in bytes      
 42         # %r8d: nrounds                           
 43                                                   
 44         # This function encrypts two ChaCha blocks by loading the state
 45         # matrix twice across four AVX registers. It performs matrix operations
 46         # on four words in each matrix in parallel, but requires shuffling to
 47         # rearrange the words after each round.
 48                                                   
 49         vzeroupper                                
 50                                                   
 51         # x0..3[0-2] = s0..3                      
 52         vbroadcasti128  0x00(%rdi),%ymm0          
 53         vbroadcasti128  0x10(%rdi),%ymm1          
 54         vbroadcasti128  0x20(%rdi),%ymm2          
 55         vbroadcasti128  0x30(%rdi),%ymm3          
 56                                                   
 57         vpaddd          CTR2BL(%rip),%ymm3,%ymm3
 58                                                   
 59         vmovdqa         %ymm0,%ymm8               
 60         vmovdqa         %ymm1,%ymm9               
 61         vmovdqa         %ymm2,%ymm10              
 62         vmovdqa         %ymm3,%ymm11              
 63                                                   
 64         vmovdqa         ROT8(%rip),%ymm4          
 65         vmovdqa         ROT16(%rip),%ymm5         
 66                                                   
 67         mov             %rcx,%rax                 
 68                                                   
 69 .Ldoubleround:                                    
 70                                                   
 71         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)      
 72         vpaddd          %ymm1,%ymm0,%ymm0         
 73         vpxor           %ymm0,%ymm3,%ymm3         
 74         vpshufb         %ymm5,%ymm3,%ymm3         
 75                                                   
 76         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)      
 77         vpaddd          %ymm3,%ymm2,%ymm2         
 78         vpxor           %ymm2,%ymm1,%ymm1         
 79         vmovdqa         %ymm1,%ymm6               
 80         vpslld          $12,%ymm6,%ymm6           
 81         vpsrld          $20,%ymm1,%ymm1           
 82         vpor            %ymm6,%ymm1,%ymm1         
 83                                                   
 84         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)       
 85         vpaddd          %ymm1,%ymm0,%ymm0         
 86         vpxor           %ymm0,%ymm3,%ymm3         
 87         vpshufb         %ymm4,%ymm3,%ymm3         
 88                                                   
 89         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)       
 90         vpaddd          %ymm3,%ymm2,%ymm2         
 91         vpxor           %ymm2,%ymm1,%ymm1         
 92         vmovdqa         %ymm1,%ymm7               
 93         vpslld          $7,%ymm7,%ymm7            
 94         vpsrld          $25,%ymm1,%ymm1           
 95         vpor            %ymm7,%ymm1,%ymm1         
 96                                                   
 97         # x1 = shuffle32(x1, MASK(0, 3, 2, 1))    
 98         vpshufd         $0x39,%ymm1,%ymm1         
 99         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))    
100         vpshufd         $0x4e,%ymm2,%ymm2         
101         # x3 = shuffle32(x3, MASK(2, 1, 0, 3))    
102         vpshufd         $0x93,%ymm3,%ymm3         
103                                                   
104         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)      
105         vpaddd          %ymm1,%ymm0,%ymm0         
106         vpxor           %ymm0,%ymm3,%ymm3         
107         vpshufb         %ymm5,%ymm3,%ymm3         
108                                                   
109         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)      
110         vpaddd          %ymm3,%ymm2,%ymm2         
111         vpxor           %ymm2,%ymm1,%ymm1         
112         vmovdqa         %ymm1,%ymm6               
113         vpslld          $12,%ymm6,%ymm6           
114         vpsrld          $20,%ymm1,%ymm1           
115         vpor            %ymm6,%ymm1,%ymm1         
116                                                   
117         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)       
118         vpaddd          %ymm1,%ymm0,%ymm0         
119         vpxor           %ymm0,%ymm3,%ymm3         
120         vpshufb         %ymm4,%ymm3,%ymm3         
121                                                   
122         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)       
123         vpaddd          %ymm3,%ymm2,%ymm2         
124         vpxor           %ymm2,%ymm1,%ymm1         
125         vmovdqa         %ymm1,%ymm7               
126         vpslld          $7,%ymm7,%ymm7            
127         vpsrld          $25,%ymm1,%ymm1           
128         vpor            %ymm7,%ymm1,%ymm1         
129                                                   
130         # x1 = shuffle32(x1, MASK(2, 1, 0, 3))    
131         vpshufd         $0x93,%ymm1,%ymm1         
132         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))    
133         vpshufd         $0x4e,%ymm2,%ymm2         
134         # x3 = shuffle32(x3, MASK(0, 3, 2, 1))    
135         vpshufd         $0x39,%ymm3,%ymm3         
136                                                   
137         sub             $2,%r8d                   
138         jnz             .Ldoubleround             
139                                                   
140         # o0 = i0 ^ (x0 + s0)                     
141         vpaddd          %ymm8,%ymm0,%ymm7         
142         cmp             $0x10,%rax                
143         jl              .Lxorpart2                
144         vpxor           0x00(%rdx),%xmm7,%xmm6    
145         vmovdqu         %xmm6,0x00(%rsi)          
146         vextracti128    $1,%ymm7,%xmm0            
147         # o1 = i1 ^ (x1 + s1)                     
148         vpaddd          %ymm9,%ymm1,%ymm7         
149         cmp             $0x20,%rax                
150         jl              .Lxorpart2                
151         vpxor           0x10(%rdx),%xmm7,%xmm6    
152         vmovdqu         %xmm6,0x10(%rsi)          
153         vextracti128    $1,%ymm7,%xmm1            
154         # o2 = i2 ^ (x2 + s2)                     
155         vpaddd          %ymm10,%ymm2,%ymm7        
156         cmp             $0x30,%rax                
157         jl              .Lxorpart2                
158         vpxor           0x20(%rdx),%xmm7,%xmm6    
159         vmovdqu         %xmm6,0x20(%rsi)          
160         vextracti128    $1,%ymm7,%xmm2            
161         # o3 = i3 ^ (x3 + s3)                     
162         vpaddd          %ymm11,%ymm3,%ymm7        
163         cmp             $0x40,%rax                
164         jl              .Lxorpart2                
165         vpxor           0x30(%rdx),%xmm7,%xmm6    
166         vmovdqu         %xmm6,0x30(%rsi)          
167         vextracti128    $1,%ymm7,%xmm3            
168                                                   
169         # xor and write second block              
170         vmovdqa         %xmm0,%xmm7               
171         cmp             $0x50,%rax                
172         jl              .Lxorpart2                
173         vpxor           0x40(%rdx),%xmm7,%xmm6    
174         vmovdqu         %xmm6,0x40(%rsi)          
175                                                   
176         vmovdqa         %xmm1,%xmm7               
177         cmp             $0x60,%rax                
178         jl              .Lxorpart2                
179         vpxor           0x50(%rdx),%xmm7,%xmm6    
180         vmovdqu         %xmm6,0x50(%rsi)          
181                                                   
182         vmovdqa         %xmm2,%xmm7               
183         cmp             $0x70,%rax                
184         jl              .Lxorpart2                
185         vpxor           0x60(%rdx),%xmm7,%xmm6    
186         vmovdqu         %xmm6,0x60(%rsi)          
187                                                   
188         vmovdqa         %xmm3,%xmm7               
189         cmp             $0x80,%rax                
190         jl              .Lxorpart2                
191         vpxor           0x70(%rdx),%xmm7,%xmm6    
192         vmovdqu         %xmm6,0x70(%rsi)          
193                                                   
194 .Ldone2:                                          
195         vzeroupper                                
196         RET                                       
197                                                   
198 .Lxorpart2:                                       
199         # xor remaining bytes from partial register into output
200         mov             %rax,%r9                  
201         and             $0x0f,%r9                 
202         jz              .Ldone2                   
203         and             $~0x0f,%rax               
204                                                   
205         mov             %rsi,%r11                 
206                                                   
207         lea             8(%rsp),%r10              
208         sub             $0x10,%rsp                
209         and             $~31,%rsp                 
210                                                   
211         lea             (%rdx,%rax),%rsi          
212         mov             %rsp,%rdi                 
213         mov             %r9,%rcx                  
214         rep movsb                                 
215                                                   
216         vpxor           0x00(%rsp),%xmm7,%xmm7    
217         vmovdqa         %xmm7,0x00(%rsp)          
218                                                   
219         mov             %rsp,%rsi                 
220         lea             (%r11,%rax),%rdi          
221         mov             %r9,%rcx                  
222         rep movsb                                 
223                                                   
224         lea             -8(%r10),%rsp             
225         jmp             .Ldone2                   
226                                                   
227 SYM_FUNC_END(chacha_2block_xor_avx2)              
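For reference, the double round implemented by .Ldoubleround above is the standard ChaCha column round followed by a diagonal round; the vpshufd instructions rotate rows 1-3 of the state so the diagonal round can reuse the column-round code, and the mirrored shuffles in the second half undo that rotation. A scalar C sketch of the same permutation (an RFC 7539-style reference, not a drop-in for the vectorized routines):

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t x, int n)
    {
            return (x << n) | (x >> (32 - n));
    }

    static inline void quarterround(uint32_t *a, uint32_t *b,
                                    uint32_t *c, uint32_t *d)
    {
            *a += *b; *d = rotl32(*d ^ *a, 16);
            *c += *d; *b = rotl32(*b ^ *c, 12);
            *a += *b; *d = rotl32(*d ^ *a, 8);
            *c += *d; *b = rotl32(*b ^ *c, 7);
    }

    /* x is the 16-word ChaCha state, nrounds is typically 20 or 12 */
    static void chacha_double_rounds(uint32_t x[16], int nrounds)
    {
            for (int i = 0; i < nrounds; i += 2) {
                    /* column round */
                    quarterround(&x[0], &x[4], &x[ 8], &x[12]);
                    quarterround(&x[1], &x[5], &x[ 9], &x[13]);
                    quarterround(&x[2], &x[6], &x[10], &x[14]);
                    quarterround(&x[3], &x[7], &x[11], &x[15]);
                    /* diagonal round */
                    quarterround(&x[0], &x[5], &x[10], &x[15]);
                    quarterround(&x[1], &x[6], &x[11], &x[12]);
                    quarterround(&x[2], &x[7], &x[ 8], &x[13]);
                    quarterround(&x[3], &x[4], &x[ 9], &x[14]);
            }
    }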
228                                                   
229 SYM_FUNC_START(chacha_4block_xor_avx2)            
230         # %rdi: Input state matrix, s             
231         # %rsi: up to 4 data blocks output, o     
232         # %rdx: up to 4 data blocks input, i      
233         # %rcx: input/output length in bytes      
234         # %r8d: nrounds                           
235                                                   
236         # This function encrypts four ChaCha blocks by loading the state
237         # matrix four times across eight AVX registers. It performs matrix
238         # operations on four words in two matrices in parallel, sequentially
239         # to the operations on the four words of the other two matrices. The
240         # required word shuffling has a rather high latency, we can do the
241         # arithmetic on two matrix-pairs without much slowdown.
242                                                   
243         vzeroupper                                
244                                                   
245         # x0..3[0-4] = s0..3                      
246         vbroadcasti128  0x00(%rdi),%ymm0          
247         vbroadcasti128  0x10(%rdi),%ymm1          
248         vbroadcasti128  0x20(%rdi),%ymm2          
249         vbroadcasti128  0x30(%rdi),%ymm3          
250                                                   
251         vmovdqa         %ymm0,%ymm4               
252         vmovdqa         %ymm1,%ymm5               
253         vmovdqa         %ymm2,%ymm6               
254         vmovdqa         %ymm3,%ymm7               
255                                                   
256         vpaddd          CTR2BL(%rip),%ymm3,%ymm3
257         vpaddd          CTR4BL(%rip),%ymm7,%ymm7
258                                                   
259         vmovdqa         %ymm0,%ymm11              
260         vmovdqa         %ymm1,%ymm12              
261         vmovdqa         %ymm2,%ymm13              
262         vmovdqa         %ymm3,%ymm14              
263         vmovdqa         %ymm7,%ymm15              
264                                                   
265         vmovdqa         ROT8(%rip),%ymm8          
266         vmovdqa         ROT16(%rip),%ymm9         
267                                                   
268         mov             %rcx,%rax                 
269                                                   
270 .Ldoubleround4:                                   
271                                                   
272         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)      
273         vpaddd          %ymm1,%ymm0,%ymm0         
274         vpxor           %ymm0,%ymm3,%ymm3         
275         vpshufb         %ymm9,%ymm3,%ymm3         
276                                                   
277         vpaddd          %ymm5,%ymm4,%ymm4         
278         vpxor           %ymm4,%ymm7,%ymm7         
279         vpshufb         %ymm9,%ymm7,%ymm7         
280                                                   
281         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)      
282         vpaddd          %ymm3,%ymm2,%ymm2         
283         vpxor           %ymm2,%ymm1,%ymm1         
284         vmovdqa         %ymm1,%ymm10              
285         vpslld          $12,%ymm10,%ymm10         
286         vpsrld          $20,%ymm1,%ymm1           
287         vpor            %ymm10,%ymm1,%ymm1        
288                                                   
289         vpaddd          %ymm7,%ymm6,%ymm6         
290         vpxor           %ymm6,%ymm5,%ymm5         
291         vmovdqa         %ymm5,%ymm10              
292         vpslld          $12,%ymm10,%ymm10         
293         vpsrld          $20,%ymm5,%ymm5           
294         vpor            %ymm10,%ymm5,%ymm5        
295                                                   
296         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)       
297         vpaddd          %ymm1,%ymm0,%ymm0         
298         vpxor           %ymm0,%ymm3,%ymm3         
299         vpshufb         %ymm8,%ymm3,%ymm3         
300                                                   
301         vpaddd          %ymm5,%ymm4,%ymm4         
302         vpxor           %ymm4,%ymm7,%ymm7         
303         vpshufb         %ymm8,%ymm7,%ymm7         
304                                                   
305         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)       
306         vpaddd          %ymm3,%ymm2,%ymm2         
307         vpxor           %ymm2,%ymm1,%ymm1         
308         vmovdqa         %ymm1,%ymm10              
309         vpslld          $7,%ymm10,%ymm10          
310         vpsrld          $25,%ymm1,%ymm1           
311         vpor            %ymm10,%ymm1,%ymm1        
312                                                   
313         vpaddd          %ymm7,%ymm6,%ymm6         
314         vpxor           %ymm6,%ymm5,%ymm5         
315         vmovdqa         %ymm5,%ymm10              
316         vpslld          $7,%ymm10,%ymm10          
317         vpsrld          $25,%ymm5,%ymm5           
318         vpor            %ymm10,%ymm5,%ymm5        
319                                                   
320         # x1 = shuffle32(x1, MASK(0, 3, 2, 1))    
321         vpshufd         $0x39,%ymm1,%ymm1         
322         vpshufd         $0x39,%ymm5,%ymm5         
323         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))    
324         vpshufd         $0x4e,%ymm2,%ymm2         
325         vpshufd         $0x4e,%ymm6,%ymm6         
326         # x3 = shuffle32(x3, MASK(2, 1, 0, 3))    
327         vpshufd         $0x93,%ymm3,%ymm3         
328         vpshufd         $0x93,%ymm7,%ymm7         
329                                                   
330         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)      
331         vpaddd          %ymm1,%ymm0,%ymm0         
332         vpxor           %ymm0,%ymm3,%ymm3         
333         vpshufb         %ymm9,%ymm3,%ymm3         
334                                                   
335         vpaddd          %ymm5,%ymm4,%ymm4         
336         vpxor           %ymm4,%ymm7,%ymm7         
337         vpshufb         %ymm9,%ymm7,%ymm7         
338                                                   
339         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)      
340         vpaddd          %ymm3,%ymm2,%ymm2         
341         vpxor           %ymm2,%ymm1,%ymm1         
342         vmovdqa         %ymm1,%ymm10              
343         vpslld          $12,%ymm10,%ymm10         
344         vpsrld          $20,%ymm1,%ymm1           
345         vpor            %ymm10,%ymm1,%ymm1        
346                                                   
347         vpaddd          %ymm7,%ymm6,%ymm6         
348         vpxor           %ymm6,%ymm5,%ymm5         
349         vmovdqa         %ymm5,%ymm10              
350         vpslld          $12,%ymm10,%ymm10         
351         vpsrld          $20,%ymm5,%ymm5           
352         vpor            %ymm10,%ymm5,%ymm5        
353                                                   
354         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)       
355         vpaddd          %ymm1,%ymm0,%ymm0         
356         vpxor           %ymm0,%ymm3,%ymm3         
357         vpshufb         %ymm8,%ymm3,%ymm3         
358                                                   
359         vpaddd          %ymm5,%ymm4,%ymm4         
360         vpxor           %ymm4,%ymm7,%ymm7         
361         vpshufb         %ymm8,%ymm7,%ymm7         
362                                                   
363         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)       
364         vpaddd          %ymm3,%ymm2,%ymm2         
365         vpxor           %ymm2,%ymm1,%ymm1         
366         vmovdqa         %ymm1,%ymm10              
367         vpslld          $7,%ymm10,%ymm10          
368         vpsrld          $25,%ymm1,%ymm1           
369         vpor            %ymm10,%ymm1,%ymm1        
370                                                   
371         vpaddd          %ymm7,%ymm6,%ymm6         
372         vpxor           %ymm6,%ymm5,%ymm5         
373         vmovdqa         %ymm5,%ymm10              
374         vpslld          $7,%ymm10,%ymm10          
375         vpsrld          $25,%ymm5,%ymm5           
376         vpor            %ymm10,%ymm5,%ymm5        
377                                                   
378         # x1 = shuffle32(x1, MASK(2, 1, 0, 3))    
379         vpshufd         $0x93,%ymm1,%ymm1         
380         vpshufd         $0x93,%ymm5,%ymm5         
381         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))    
382         vpshufd         $0x4e,%ymm2,%ymm2         
383         vpshufd         $0x4e,%ymm6,%ymm6         
384         # x3 = shuffle32(x3, MASK(0, 3, 2, 1))    
385         vpshufd         $0x39,%ymm3,%ymm3         
386         vpshufd         $0x39,%ymm7,%ymm7         
387                                                   
388         sub             $2,%r8d                   
389         jnz             .Ldoubleround4            
390                                                   
391         # o0 = i0 ^ (x0 + s0), first block        
392         vpaddd          %ymm11,%ymm0,%ymm10       
393         cmp             $0x10,%rax                
394         jl              .Lxorpart4                
395         vpxor           0x00(%rdx),%xmm10,%xmm9
396         vmovdqu         %xmm9,0x00(%rsi)          
397         vextracti128    $1,%ymm10,%xmm0           
398         # o1 = i1 ^ (x1 + s1), first block        
399         vpaddd          %ymm12,%ymm1,%ymm10       
400         cmp             $0x20,%rax                
401         jl              .Lxorpart4                
402         vpxor           0x10(%rdx),%xmm10,%xmm9
403         vmovdqu         %xmm9,0x10(%rsi)          
404         vextracti128    $1,%ymm10,%xmm1           
405         # o2 = i2 ^ (x2 + s2), first block        
406         vpaddd          %ymm13,%ymm2,%ymm10       
407         cmp             $0x30,%rax                
408         jl              .Lxorpart4                
409         vpxor           0x20(%rdx),%xmm10,%xmm9
410         vmovdqu         %xmm9,0x20(%rsi)          
411         vextracti128    $1,%ymm10,%xmm2           
412         # o3 = i3 ^ (x3 + s3), first block        
413         vpaddd          %ymm14,%ymm3,%ymm10       
414         cmp             $0x40,%rax                
415         jl              .Lxorpart4                
416         vpxor           0x30(%rdx),%xmm10,%xmm9
417         vmovdqu         %xmm9,0x30(%rsi)          
418         vextracti128    $1,%ymm10,%xmm3           
419                                                   
420         # xor and write second block              
421         vmovdqa         %xmm0,%xmm10              
422         cmp             $0x50,%rax                
423         jl              .Lxorpart4                
424         vpxor           0x40(%rdx),%xmm10,%xmm9
425         vmovdqu         %xmm9,0x40(%rsi)          
426                                                   
427         vmovdqa         %xmm1,%xmm10              
428         cmp             $0x60,%rax                
429         jl              .Lxorpart4                
430         vpxor           0x50(%rdx),%xmm10,%xmm9
431         vmovdqu         %xmm9,0x50(%rsi)          
432                                                   
433         vmovdqa         %xmm2,%xmm10              
434         cmp             $0x70,%rax                
435         jl              .Lxorpart4                
436         vpxor           0x60(%rdx),%xmm10,%xmm9
437         vmovdqu         %xmm9,0x60(%rsi)          
438                                                   
439         vmovdqa         %xmm3,%xmm10              
440         cmp             $0x80,%rax                
441         jl              .Lxorpart4                
442         vpxor           0x70(%rdx),%xmm10,%xmm9
443         vmovdqu         %xmm9,0x70(%rsi)          
444                                                   
445         # o0 = i0 ^ (x0 + s0), third block        
446         vpaddd          %ymm11,%ymm4,%ymm10       
447         cmp             $0x90,%rax                
448         jl              .Lxorpart4                
449         vpxor           0x80(%rdx),%xmm10,%xmm9
450         vmovdqu         %xmm9,0x80(%rsi)          
451         vextracti128    $1,%ymm10,%xmm4           
452         # o1 = i1 ^ (x1 + s1), third block        
453         vpaddd          %ymm12,%ymm5,%ymm10       
454         cmp             $0xa0,%rax                
455         jl              .Lxorpart4                
456         vpxor           0x90(%rdx),%xmm10,%xmm9
457         vmovdqu         %xmm9,0x90(%rsi)          
458         vextracti128    $1,%ymm10,%xmm5           
459         # o2 = i2 ^ (x2 + s2), third block        
460         vpaddd          %ymm13,%ymm6,%ymm10       
461         cmp             $0xb0,%rax                
462         jl              .Lxorpart4                
463         vpxor           0xa0(%rdx),%xmm10,%xmm9
464         vmovdqu         %xmm9,0xa0(%rsi)          
465         vextracti128    $1,%ymm10,%xmm6           
466         # o3 = i3 ^ (x3 + s3), third block        
467         vpaddd          %ymm15,%ymm7,%ymm10       
468         cmp             $0xc0,%rax                
469         jl              .Lxorpart4                
470         vpxor           0xb0(%rdx),%xmm10,%xmm9
471         vmovdqu         %xmm9,0xb0(%rsi)          
472         vextracti128    $1,%ymm10,%xmm7           
473                                                   
474         # xor and write fourth block              
475         vmovdqa         %xmm4,%xmm10              
476         cmp             $0xd0,%rax                
477         jl              .Lxorpart4                
478         vpxor           0xc0(%rdx),%xmm10,%xmm9
479         vmovdqu         %xmm9,0xc0(%rsi)          
480                                                   
481         vmovdqa         %xmm5,%xmm10              
482         cmp             $0xe0,%rax                
483         jl              .Lxorpart4                
484         vpxor           0xd0(%rdx),%xmm10,%xmm9
485         vmovdqu         %xmm9,0xd0(%rsi)          
486                                                   
487         vmovdqa         %xmm6,%xmm10              
488         cmp             $0xf0,%rax                
489         jl              .Lxorpart4                
490         vpxor           0xe0(%rdx),%xmm10,%xmm9
491         vmovdqu         %xmm9,0xe0(%rsi)          
492                                                   
493         vmovdqa         %xmm7,%xmm10              
494         cmp             $0x100,%rax               
495         jl              .Lxorpart4                
496         vpxor           0xf0(%rdx),%xmm10,%xmm9
497         vmovdqu         %xmm9,0xf0(%rsi)          
498                                                   
499 .Ldone4:                                          
500         vzeroupper                                
501         RET                                       
502                                                   
503 .Lxorpart4:                                       
504         # xor remaining bytes from partial register into output
505         mov             %rax,%r9                  
506         and             $0x0f,%r9                 
507         jz              .Ldone4                   
508         and             $~0x0f,%rax               
509                                                   
510         mov             %rsi,%r11                 
511                                                   
512         lea             8(%rsp),%r10              
513         sub             $0x10,%rsp                
514         and             $~31,%rsp                 
515                                                   
516         lea             (%rdx,%rax),%rsi          
517         mov             %rsp,%rdi                 
518         mov             %r9,%rcx                  
519         rep movsb                                 
520                                                   
521         vpxor           0x00(%rsp),%xmm10,%xmm10
522         vmovdqa         %xmm10,0x00(%rsp)         
523                                                   
524         mov             %rsp,%rsi                 
525         lea             (%r11,%rax),%rdi          
526         mov             %r9,%rcx                  
527         rep movsb                                 
528                                                   
529         lea             -8(%r10),%rsp             
530         jmp             .Ldone4                   
531                                                   
532 SYM_FUNC_END(chacha_4block_xor_avx2)              
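The .Lxorpart2/.Lxorpart4 tails above handle request lengths that are not a multiple of 16 bytes: the leftover input bytes are bounced through an aligned stack slot with rep movsb, XORed against the partial keystream still held in a vector register, and copied back out. A C sketch of the equivalent logic (an illustrative assumption about intent, not kernel code; the 8-block routine below does the same at 32-byte granularity):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void xor_partial_block16(uint8_t *dst, const uint8_t *src, size_t len,
                                    const uint8_t keystream[16])
    {
            size_t offset = len & ~(size_t)15;      /* and  $~0x0f,%rax */
            size_t tail   = len & 15;               /* and  $0x0f,%r9   */
            uint8_t buf[16] = { 0 };                /* aligned stack slot */

            if (!tail)
                    return;
            memcpy(buf, src + offset, tail);        /* rep movsb, input -> buffer  */
            for (size_t i = 0; i < tail; i++)       /* vpxor with the keystream    */
                    buf[i] ^= keystream[i];
            memcpy(dst + offset, buf, tail);        /* rep movsb, buffer -> output */
    }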
533                                                   
534 SYM_FUNC_START(chacha_8block_xor_avx2)            
535         # %rdi: Input state matrix, s             
536         # %rsi: up to 8 data blocks output, o     
537         # %rdx: up to 8 data blocks input, i      
538         # %rcx: input/output length in bytes      
539         # %r8d: nrounds                           
540                                                   
541         # This function encrypts eight consecutive ChaCha blocks by loading
542         # the state matrix in AVX registers eight times. As we need some
543         # scratch registers, we save the first four registers on the stack. The
544         # algorithm performs each operation on the corresponding word of each
545         # state matrix, hence requires no word shuffling. For final XORing step
546         # we transpose the matrix by interleaving 32-, 64- and then 128-bit
547         # words, which allows us to do XOR in AVX registers. 8/16-bit word
548         # rotation is done with the slightly better performing byte shuffling,
549         # 7/12-bit word rotation uses traditional shift+OR.
550                                                   
551         vzeroupper                                
552         # 4 * 32 byte stack, 32-byte aligned      
553         lea             8(%rsp),%r10              
554         and             $~31, %rsp                
555         sub             $0x80, %rsp               
556         mov             %rcx,%rax                 
557                                                   
558         # x0..15[0-7] = s[0..15]                  
559         vpbroadcastd    0x00(%rdi),%ymm0          
560         vpbroadcastd    0x04(%rdi),%ymm1          
561         vpbroadcastd    0x08(%rdi),%ymm2          
562         vpbroadcastd    0x0c(%rdi),%ymm3          
563         vpbroadcastd    0x10(%rdi),%ymm4          
564         vpbroadcastd    0x14(%rdi),%ymm5          
565         vpbroadcastd    0x18(%rdi),%ymm6          
566         vpbroadcastd    0x1c(%rdi),%ymm7          
567         vpbroadcastd    0x20(%rdi),%ymm8          
568         vpbroadcastd    0x24(%rdi),%ymm9          
569         vpbroadcastd    0x28(%rdi),%ymm10         
570         vpbroadcastd    0x2c(%rdi),%ymm11         
571         vpbroadcastd    0x30(%rdi),%ymm12         
572         vpbroadcastd    0x34(%rdi),%ymm13         
573         vpbroadcastd    0x38(%rdi),%ymm14         
574         vpbroadcastd    0x3c(%rdi),%ymm15         
575         # x0..3 on stack                          
576         vmovdqa         %ymm0,0x00(%rsp)          
577         vmovdqa         %ymm1,0x20(%rsp)          
578         vmovdqa         %ymm2,0x40(%rsp)          
579         vmovdqa         %ymm3,0x60(%rsp)          
580                                                   
581         vmovdqa         CTRINC(%rip),%ymm1        
582         vmovdqa         ROT8(%rip),%ymm2          
583         vmovdqa         ROT16(%rip),%ymm3         
584                                                   
585         # x12 += counter values 0-3               
586         vpaddd          %ymm1,%ymm12,%ymm12       
587                                                   
588 .Ldoubleround8:                                   
589         # x0 += x4, x12 = rotl32(x12 ^ x0, 16)    
590         vpaddd          0x00(%rsp),%ymm4,%ymm0    
591         vmovdqa         %ymm0,0x00(%rsp)          
592         vpxor           %ymm0,%ymm12,%ymm12       
593         vpshufb         %ymm3,%ymm12,%ymm12       
594         # x1 += x5, x13 = rotl32(x13 ^ x1, 16)    
595         vpaddd          0x20(%rsp),%ymm5,%ymm0    
596         vmovdqa         %ymm0,0x20(%rsp)          
597         vpxor           %ymm0,%ymm13,%ymm13       
598         vpshufb         %ymm3,%ymm13,%ymm13       
599         # x2 += x6, x14 = rotl32(x14 ^ x2, 16)    
600         vpaddd          0x40(%rsp),%ymm6,%ymm0    
601         vmovdqa         %ymm0,0x40(%rsp)          
602         vpxor           %ymm0,%ymm14,%ymm14       
603         vpshufb         %ymm3,%ymm14,%ymm14       
604         # x3 += x7, x15 = rotl32(x15 ^ x3, 16)    
605         vpaddd          0x60(%rsp),%ymm7,%ymm0    
606         vmovdqa         %ymm0,0x60(%rsp)          
607         vpxor           %ymm0,%ymm15,%ymm15       
608         vpshufb         %ymm3,%ymm15,%ymm15       
609                                                   
610         # x8 += x12, x4 = rotl32(x4 ^ x8, 12)     
611         vpaddd          %ymm12,%ymm8,%ymm8        
612         vpxor           %ymm8,%ymm4,%ymm4         
613         vpslld          $12,%ymm4,%ymm0           
614         vpsrld          $20,%ymm4,%ymm4           
615         vpor            %ymm0,%ymm4,%ymm4         
616         # x9 += x13, x5 = rotl32(x5 ^ x9, 12)     
617         vpaddd          %ymm13,%ymm9,%ymm9        
618         vpxor           %ymm9,%ymm5,%ymm5         
619         vpslld          $12,%ymm5,%ymm0           
620         vpsrld          $20,%ymm5,%ymm5           
621         vpor            %ymm0,%ymm5,%ymm5         
622         # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
623         vpaddd          %ymm14,%ymm10,%ymm10      
624         vpxor           %ymm10,%ymm6,%ymm6        
625         vpslld          $12,%ymm6,%ymm0           
626         vpsrld          $20,%ymm6,%ymm6           
627         vpor            %ymm0,%ymm6,%ymm6         
628         # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
629         vpaddd          %ymm15,%ymm11,%ymm11      
630         vpxor           %ymm11,%ymm7,%ymm7        
631         vpslld          $12,%ymm7,%ymm0           
632         vpsrld          $20,%ymm7,%ymm7           
633         vpor            %ymm0,%ymm7,%ymm7         
634                                                   
635         # x0 += x4, x12 = rotl32(x12 ^ x0, 8)     
636         vpaddd          0x00(%rsp),%ymm4,%ymm0    
637         vmovdqa         %ymm0,0x00(%rsp)          
638         vpxor           %ymm0,%ymm12,%ymm12       
639         vpshufb         %ymm2,%ymm12,%ymm12       
640         # x1 += x5, x13 = rotl32(x13 ^ x1, 8)     
641         vpaddd          0x20(%rsp),%ymm5,%ymm0    
642         vmovdqa         %ymm0,0x20(%rsp)          
643         vpxor           %ymm0,%ymm13,%ymm13       
644         vpshufb         %ymm2,%ymm13,%ymm13       
645         # x2 += x6, x14 = rotl32(x14 ^ x2, 8)     
646         vpaddd          0x40(%rsp),%ymm6,%ymm0    
647         vmovdqa         %ymm0,0x40(%rsp)          
648         vpxor           %ymm0,%ymm14,%ymm14       
649         vpshufb         %ymm2,%ymm14,%ymm14       
650         # x3 += x7, x15 = rotl32(x15 ^ x3, 8)     
651         vpaddd          0x60(%rsp),%ymm7,%ymm0    
652         vmovdqa         %ymm0,0x60(%rsp)          
653         vpxor           %ymm0,%ymm15,%ymm15       
654         vpshufb         %ymm2,%ymm15,%ymm15       
655                                                   
656         # x8 += x12, x4 = rotl32(x4 ^ x8, 7)      
657         vpaddd          %ymm12,%ymm8,%ymm8        
658         vpxor           %ymm8,%ymm4,%ymm4         
659         vpslld          $7,%ymm4,%ymm0            
660         vpsrld          $25,%ymm4,%ymm4           
661         vpor            %ymm0,%ymm4,%ymm4         
662         # x9 += x13, x5 = rotl32(x5 ^ x9, 7)      
663         vpaddd          %ymm13,%ymm9,%ymm9        
664         vpxor           %ymm9,%ymm5,%ymm5         
665         vpslld          $7,%ymm5,%ymm0            
666         vpsrld          $25,%ymm5,%ymm5           
667         vpor            %ymm0,%ymm5,%ymm5         
668         # x10 += x14, x6 = rotl32(x6 ^ x10, 7)    
669         vpaddd          %ymm14,%ymm10,%ymm10      
670         vpxor           %ymm10,%ymm6,%ymm6        
671         vpslld          $7,%ymm6,%ymm0            
672         vpsrld          $25,%ymm6,%ymm6           
673         vpor            %ymm0,%ymm6,%ymm6         
674         # x11 += x15, x7 = rotl32(x7 ^ x11, 7)    
675         vpaddd          %ymm15,%ymm11,%ymm11      
676         vpxor           %ymm11,%ymm7,%ymm7        
677         vpslld          $7,%ymm7,%ymm0            
678         vpsrld          $25,%ymm7,%ymm7           
679         vpor            %ymm0,%ymm7,%ymm7         
680                                                   
681         # x0 += x5, x15 = rotl32(x15 ^ x0, 16)    
682         vpaddd          0x00(%rsp),%ymm5,%ymm0    
683         vmovdqa         %ymm0,0x00(%rsp)          
684         vpxor           %ymm0,%ymm15,%ymm15       
685         vpshufb         %ymm3,%ymm15,%ymm15       
686         # x1 += x6, x12 = rotl32(x12 ^ x1, 16)    
687         vpaddd          0x20(%rsp),%ymm6,%ymm0    
688         vmovdqa         %ymm0,0x20(%rsp)          
689         vpxor           %ymm0,%ymm12,%ymm12       
690         vpshufb         %ymm3,%ymm12,%ymm12       
691         # x2 += x7, x13 = rotl32(x13 ^ x2, 16)    
692         vpaddd          0x40(%rsp),%ymm7,%ymm0    
693         vmovdqa         %ymm0,0x40(%rsp)          
694         vpxor           %ymm0,%ymm13,%ymm13       
695         vpshufb         %ymm3,%ymm13,%ymm13       
696         # x3 += x4, x14 = rotl32(x14 ^ x3, 16)    
697         vpaddd          0x60(%rsp),%ymm4,%ymm0    
698         vmovdqa         %ymm0,0x60(%rsp)          
699         vpxor           %ymm0,%ymm14,%ymm14       
700         vpshufb         %ymm3,%ymm14,%ymm14       
701                                                   
702         # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
703         vpaddd          %ymm15,%ymm10,%ymm10      
704         vpxor           %ymm10,%ymm5,%ymm5        
705         vpslld          $12,%ymm5,%ymm0           
706         vpsrld          $20,%ymm5,%ymm5           
707         vpor            %ymm0,%ymm5,%ymm5         
708         # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
709         vpaddd          %ymm12,%ymm11,%ymm11      
710         vpxor           %ymm11,%ymm6,%ymm6        
711         vpslld          $12,%ymm6,%ymm0           
712         vpsrld          $20,%ymm6,%ymm6           
713         vpor            %ymm0,%ymm6,%ymm6         
714         # x8 += x13, x7 = rotl32(x7 ^ x8, 12)     
715         vpaddd          %ymm13,%ymm8,%ymm8        
716         vpxor           %ymm8,%ymm7,%ymm7         
717         vpslld          $12,%ymm7,%ymm0           
718         vpsrld          $20,%ymm7,%ymm7           
719         vpor            %ymm0,%ymm7,%ymm7         
720         # x9 += x14, x4 = rotl32(x4 ^ x9, 12)     
721         vpaddd          %ymm14,%ymm9,%ymm9        
722         vpxor           %ymm9,%ymm4,%ymm4         
723         vpslld          $12,%ymm4,%ymm0           
724         vpsrld          $20,%ymm4,%ymm4           
725         vpor            %ymm0,%ymm4,%ymm4         
726                                                   
727         # x0 += x5, x15 = rotl32(x15 ^ x0, 8)     
728         vpaddd          0x00(%rsp),%ymm5,%ymm0    
729         vmovdqa         %ymm0,0x00(%rsp)          
730         vpxor           %ymm0,%ymm15,%ymm15       
731         vpshufb         %ymm2,%ymm15,%ymm15       
732         # x1 += x6, x12 = rotl32(x12 ^ x1, 8)     
733         vpaddd          0x20(%rsp),%ymm6,%ymm0    
734         vmovdqa         %ymm0,0x20(%rsp)          
735         vpxor           %ymm0,%ymm12,%ymm12       
736         vpshufb         %ymm2,%ymm12,%ymm12       
737         # x2 += x7, x13 = rotl32(x13 ^ x2, 8)     
738         vpaddd          0x40(%rsp),%ymm7,%ymm0    
739         vmovdqa         %ymm0,0x40(%rsp)          
740         vpxor           %ymm0,%ymm13,%ymm13       
741         vpshufb         %ymm2,%ymm13,%ymm13       
742         # x3 += x4, x14 = rotl32(x14 ^ x3, 8)     
743         vpaddd          0x60(%rsp),%ymm4,%ymm0    
744         vmovdqa         %ymm0,0x60(%rsp)          
745         vpxor           %ymm0,%ymm14,%ymm14       
746         vpshufb         %ymm2,%ymm14,%ymm14       
747                                                   
748         # x10 += x15, x5 = rotl32(x5 ^ x10, 7)    
749         vpaddd          %ymm15,%ymm10,%ymm10      
750         vpxor           %ymm10,%ymm5,%ymm5        
751         vpslld          $7,%ymm5,%ymm0            
752         vpsrld          $25,%ymm5,%ymm5           
753         vpor            %ymm0,%ymm5,%ymm5         
754         # x11 += x12, x6 = rotl32(x6 ^ x11, 7)    
755         vpaddd          %ymm12,%ymm11,%ymm11      
756         vpxor           %ymm11,%ymm6,%ymm6        
757         vpslld          $7,%ymm6,%ymm0            
758         vpsrld          $25,%ymm6,%ymm6           
759         vpor            %ymm0,%ymm6,%ymm6         
760         # x8 += x13, x7 = rotl32(x7 ^ x8, 7)      
761         vpaddd          %ymm13,%ymm8,%ymm8        
762         vpxor           %ymm8,%ymm7,%ymm7         
763         vpslld          $7,%ymm7,%ymm0            
764         vpsrld          $25,%ymm7,%ymm7           
765         vpor            %ymm0,%ymm7,%ymm7         
766         # x9 += x14, x4 = rotl32(x4 ^ x9, 7)      
767         vpaddd          %ymm14,%ymm9,%ymm9        
768         vpxor           %ymm9,%ymm4,%ymm4         
769         vpslld          $7,%ymm4,%ymm0            
770         vpsrld          $25,%ymm4,%ymm4           
771         vpor            %ymm0,%ymm4,%ymm4         
772                                                   
773         sub             $2,%r8d                   
774         jnz             .Ldoubleround8            
775                                                   
776         # x0..15[0-3] += s[0..15]                 
777         vpbroadcastd    0x00(%rdi),%ymm0          
778         vpaddd          0x00(%rsp),%ymm0,%ymm0    
779         vmovdqa         %ymm0,0x00(%rsp)          
780         vpbroadcastd    0x04(%rdi),%ymm0          
781         vpaddd          0x20(%rsp),%ymm0,%ymm0    
782         vmovdqa         %ymm0,0x20(%rsp)          
783         vpbroadcastd    0x08(%rdi),%ymm0          
784         vpaddd          0x40(%rsp),%ymm0,%ymm0    
785         vmovdqa         %ymm0,0x40(%rsp)          
786         vpbroadcastd    0x0c(%rdi),%ymm0          
787         vpaddd          0x60(%rsp),%ymm0,%ymm0    
788         vmovdqa         %ymm0,0x60(%rsp)          
789         vpbroadcastd    0x10(%rdi),%ymm0          
790         vpaddd          %ymm0,%ymm4,%ymm4         
791         vpbroadcastd    0x14(%rdi),%ymm0          
792         vpaddd          %ymm0,%ymm5,%ymm5         
793         vpbroadcastd    0x18(%rdi),%ymm0          
794         vpaddd          %ymm0,%ymm6,%ymm6         
795         vpbroadcastd    0x1c(%rdi),%ymm0          
796         vpaddd          %ymm0,%ymm7,%ymm7         
797         vpbroadcastd    0x20(%rdi),%ymm0          
798         vpaddd          %ymm0,%ymm8,%ymm8         
799         vpbroadcastd    0x24(%rdi),%ymm0          
800         vpaddd          %ymm0,%ymm9,%ymm9         
801         vpbroadcastd    0x28(%rdi),%ymm0          
802         vpaddd          %ymm0,%ymm10,%ymm10       
803         vpbroadcastd    0x2c(%rdi),%ymm0          
804         vpaddd          %ymm0,%ymm11,%ymm11       
805         vpbroadcastd    0x30(%rdi),%ymm0          
806         vpaddd          %ymm0,%ymm12,%ymm12       
807         vpbroadcastd    0x34(%rdi),%ymm0          
808         vpaddd          %ymm0,%ymm13,%ymm13       
809         vpbroadcastd    0x38(%rdi),%ymm0          
810         vpaddd          %ymm0,%ymm14,%ymm14       
811         vpbroadcastd    0x3c(%rdi),%ymm0          
812         vpaddd          %ymm0,%ymm15,%ymm15       
813                                                   
814         # x12 += counter values 0-3               
815         vpaddd          %ymm1,%ymm12,%ymm12       
816                                                   
817         # interleave 32-bit words in state n, n+1
818         vmovdqa         0x00(%rsp),%ymm0          
819         vmovdqa         0x20(%rsp),%ymm1          
820         vpunpckldq      %ymm1,%ymm0,%ymm2         
821         vpunpckhdq      %ymm1,%ymm0,%ymm1         
822         vmovdqa         %ymm2,0x00(%rsp)          
823         vmovdqa         %ymm1,0x20(%rsp)          
824         vmovdqa         0x40(%rsp),%ymm0          
825         vmovdqa         0x60(%rsp),%ymm1          
826         vpunpckldq      %ymm1,%ymm0,%ymm2         
827         vpunpckhdq      %ymm1,%ymm0,%ymm1         
828         vmovdqa         %ymm2,0x40(%rsp)          
829         vmovdqa         %ymm1,0x60(%rsp)          
830         vmovdqa         %ymm4,%ymm0               
831         vpunpckldq      %ymm5,%ymm0,%ymm4         
832         vpunpckhdq      %ymm5,%ymm0,%ymm5         
833         vmovdqa         %ymm6,%ymm0               
834         vpunpckldq      %ymm7,%ymm0,%ymm6         
835         vpunpckhdq      %ymm7,%ymm0,%ymm7         
836         vmovdqa         %ymm8,%ymm0               
837         vpunpckldq      %ymm9,%ymm0,%ymm8         
838         vpunpckhdq      %ymm9,%ymm0,%ymm9         
839         vmovdqa         %ymm10,%ymm0              
840         vpunpckldq      %ymm11,%ymm0,%ymm10       
841         vpunpckhdq      %ymm11,%ymm0,%ymm11       
842         vmovdqa         %ymm12,%ymm0              
843         vpunpckldq      %ymm13,%ymm0,%ymm12       
844         vpunpckhdq      %ymm13,%ymm0,%ymm13       
845         vmovdqa         %ymm14,%ymm0              
846         vpunpckldq      %ymm15,%ymm0,%ymm14       
847         vpunpckhdq      %ymm15,%ymm0,%ymm15       
848                                                   
849         # interleave 64-bit words in state n, n+2
850         vmovdqa         0x00(%rsp),%ymm0          
851         vmovdqa         0x40(%rsp),%ymm2          
852         vpunpcklqdq     %ymm2,%ymm0,%ymm1         
853         vpunpckhqdq     %ymm2,%ymm0,%ymm2         
854         vmovdqa         %ymm1,0x00(%rsp)          
855         vmovdqa         %ymm2,0x40(%rsp)          
856         vmovdqa         0x20(%rsp),%ymm0          
857         vmovdqa         0x60(%rsp),%ymm2          
858         vpunpcklqdq     %ymm2,%ymm0,%ymm1         
859         vpunpckhqdq     %ymm2,%ymm0,%ymm2         
860         vmovdqa         %ymm1,0x20(%rsp)          
861         vmovdqa         %ymm2,0x60(%rsp)          
862         vmovdqa         %ymm4,%ymm0               
863         vpunpcklqdq     %ymm6,%ymm0,%ymm4         
864         vpunpckhqdq     %ymm6,%ymm0,%ymm6         
865         vmovdqa         %ymm5,%ymm0               
866         vpunpcklqdq     %ymm7,%ymm0,%ymm5         
867         vpunpckhqdq     %ymm7,%ymm0,%ymm7         
868         vmovdqa         %ymm8,%ymm0               
869         vpunpcklqdq     %ymm10,%ymm0,%ymm8        
870         vpunpckhqdq     %ymm10,%ymm0,%ymm10       
871         vmovdqa         %ymm9,%ymm0               
872         vpunpcklqdq     %ymm11,%ymm0,%ymm9        
873         vpunpckhqdq     %ymm11,%ymm0,%ymm11       
874         vmovdqa         %ymm12,%ymm0              
875         vpunpcklqdq     %ymm14,%ymm0,%ymm12       
876         vpunpckhqdq     %ymm14,%ymm0,%ymm14       
877         vmovdqa         %ymm13,%ymm0              
878         vpunpcklqdq     %ymm15,%ymm0,%ymm13       
879         vpunpckhqdq     %ymm15,%ymm0,%ymm15       
880                                                   
881         # interleave 128-bit words in state n, n+4
882         # xor/write first four blocks
883         vmovdqa         0x00(%rsp),%ymm1
884         vperm2i128      $0x20,%ymm4,%ymm1,%ymm0
885         cmp             $0x0020,%rax
886         jl              .Lxorpart8
887         vpxor           0x0000(%rdx),%ymm0,%ymm0
888         vmovdqu         %ymm0,0x0000(%rsi)
889         vperm2i128      $0x31,%ymm4,%ymm1,%ymm4
890
891         vperm2i128      $0x20,%ymm12,%ymm8,%ymm0
892         cmp             $0x0040,%rax
893         jl              .Lxorpart8
894         vpxor           0x0020(%rdx),%ymm0,%ymm0
895         vmovdqu         %ymm0,0x0020(%rsi)
896         vperm2i128      $0x31,%ymm12,%ymm8,%ymm12
897
898         vmovdqa         0x40(%rsp),%ymm1
899         vperm2i128      $0x20,%ymm6,%ymm1,%ymm0
900         cmp             $0x0060,%rax
901         jl              .Lxorpart8
902         vpxor           0x0040(%rdx),%ymm0,%ymm0
903         vmovdqu         %ymm0,0x0040(%rsi)
904         vperm2i128      $0x31,%ymm6,%ymm1,%ymm6
905
906         vperm2i128      $0x20,%ymm14,%ymm10,%ymm0
907         cmp             $0x0080,%rax
908         jl              .Lxorpart8
909         vpxor           0x0060(%rdx),%ymm0,%ymm0
910         vmovdqu         %ymm0,0x0060(%rsi)
911         vperm2i128      $0x31,%ymm14,%ymm10,%ymm14
912
913         vmovdqa         0x20(%rsp),%ymm1
914         vperm2i128      $0x20,%ymm5,%ymm1,%ymm0
915         cmp             $0x00a0,%rax
916         jl              .Lxorpart8
917         vpxor           0x0080(%rdx),%ymm0,%ymm0
918         vmovdqu         %ymm0,0x0080(%rsi)
919         vperm2i128      $0x31,%ymm5,%ymm1,%ymm5
920
921         vperm2i128      $0x20,%ymm13,%ymm9,%ymm0
922         cmp             $0x00c0,%rax
923         jl              .Lxorpart8
924         vpxor           0x00a0(%rdx),%ymm0,%ymm0
925         vmovdqu         %ymm0,0x00a0(%rsi)
926         vperm2i128      $0x31,%ymm13,%ymm9,%ymm13
927
928         vmovdqa         0x60(%rsp),%ymm1
929         vperm2i128      $0x20,%ymm7,%ymm1,%ymm0
930         cmp             $0x00e0,%rax
931         jl              .Lxorpart8
932         vpxor           0x00c0(%rdx),%ymm0,%ymm0
933         vmovdqu         %ymm0,0x00c0(%rsi)
934         vperm2i128      $0x31,%ymm7,%ymm1,%ymm7
935
936         vperm2i128      $0x20,%ymm15,%ymm11,%ymm0
937         cmp             $0x0100,%rax
938         jl              .Lxorpart8
939         vpxor           0x00e0(%rdx),%ymm0,%ymm0
940         vmovdqu         %ymm0,0x00e0(%rsi)
941         vperm2i128      $0x31,%ymm15,%ymm11,%ymm15
942                                                   
943         # xor remaining blocks, write to output
944         vmovdqa         %ymm4,%ymm0
945         cmp             $0x0120,%rax
946         jl              .Lxorpart8
947         vpxor           0x0100(%rdx),%ymm0,%ymm0
948         vmovdqu         %ymm0,0x0100(%rsi)
949
950         vmovdqa         %ymm12,%ymm0
951         cmp             $0x0140,%rax
952         jl              .Lxorpart8
953         vpxor           0x0120(%rdx),%ymm0,%ymm0
954         vmovdqu         %ymm0,0x0120(%rsi)
955
956         vmovdqa         %ymm6,%ymm0
957         cmp             $0x0160,%rax
958         jl              .Lxorpart8
959         vpxor           0x0140(%rdx),%ymm0,%ymm0
960         vmovdqu         %ymm0,0x0140(%rsi)
961
962         vmovdqa         %ymm14,%ymm0
963         cmp             $0x0180,%rax
964         jl              .Lxorpart8
965         vpxor           0x0160(%rdx),%ymm0,%ymm0
966         vmovdqu         %ymm0,0x0160(%rsi)
967
968         vmovdqa         %ymm5,%ymm0
969         cmp             $0x01a0,%rax
970         jl              .Lxorpart8
971         vpxor           0x0180(%rdx),%ymm0,%ymm0
972         vmovdqu         %ymm0,0x0180(%rsi)
973
974         vmovdqa         %ymm13,%ymm0
975         cmp             $0x01c0,%rax
976         jl              .Lxorpart8
977         vpxor           0x01a0(%rdx),%ymm0,%ymm0
978         vmovdqu         %ymm0,0x01a0(%rsi)
979
980         vmovdqa         %ymm7,%ymm0
981         cmp             $0x01e0,%rax
982         jl              .Lxorpart8
983         vpxor           0x01c0(%rdx),%ymm0,%ymm0
984         vmovdqu         %ymm0,0x01c0(%rsi)
985
986         vmovdqa         %ymm15,%ymm0
987         cmp             $0x0200,%rax
988         jl              .Lxorpart8
989         vpxor           0x01e0(%rdx),%ymm0,%ymm0
990         vmovdqu         %ymm0,0x01e0(%rsi)
991                                                   
992 .Ldone8:                                          
993         vzeroupper                                
994         lea             -8(%r10),%rsp             
995         RET                                       
996                                                   
997 .Lxorpart8:                                       
998         # xor remaining bytes from partial reg    
999         mov             %rax,%r9                  
1000         and             $0x1f,%r9                
1001         jz              .Ldone8                  
1002         and             $~0x1f,%rax              
1003                                                  
1004         mov             %rsi,%r11                
1005                                                  
1006         lea             (%rdx,%rax),%rsi         
1007         mov             %rsp,%rdi                
1008         mov             %r9,%rcx                 
1009         rep movsb                                
1010                                                  
1011         vpxor           0x00(%rsp),%ymm0,%ymm0
1012         vmovdqa         %ymm0,0x00(%rsp)         
1013                                                  
1014         mov             %rsp,%rsi                
1015         lea             (%r11,%rax),%rdi         
1016         mov             %r9,%rcx                 
1017         rep movsb                                
1018                                                  
1019         jmp             .Ldone8                  
1020                                                  
1021 SYM_FUNC_END(chacha_8block_xor_avx2)             
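Taken together, the three routines cover up to 2, 4 and 8 blocks per call, each tolerating a short final block. The hypothetical C-side driver below sketches how they could fit together; the prototypes follow the register comments in this file (%rdi state, %rsi dst, %rdx src, %rcx len, %r8d nrounds), but the surrounding names and dispatch thresholds are assumptions for illustration, not the kernel's actual glue code (which also has to manage FPU state, fallbacks and feature detection).

    #include <stddef.h>
    #include <stdint.h>

    #define CHACHA_BLOCK_SIZE 64

    /* assumed prototypes, derived from the register comments above */
    void chacha_2block_xor_avx2(uint32_t *state, uint8_t *dst, const uint8_t *src,
                                size_t len, int nrounds);
    void chacha_4block_xor_avx2(uint32_t *state, uint8_t *dst, const uint8_t *src,
                                size_t len, int nrounds);
    void chacha_8block_xor_avx2(uint32_t *state, uint8_t *dst, const uint8_t *src,
                                size_t len, int nrounds);

    static void chacha_xor_avx2(uint32_t state[16], uint8_t *dst,
                                const uint8_t *src, size_t bytes, int nrounds)
    {
            /* full 8-block (512-byte) chunks first */
            while (bytes >= CHACHA_BLOCK_SIZE * 8) {
                    chacha_8block_xor_avx2(state, dst, src,
                                           CHACHA_BLOCK_SIZE * 8, nrounds);
                    bytes -= CHACHA_BLOCK_SIZE * 8;
                    src   += CHACHA_BLOCK_SIZE * 8;
                    dst   += CHACHA_BLOCK_SIZE * 8;
                    state[12] += 8;         /* 32-bit block counter */
            }
            if (!bytes)
                    return;
            /* remainder: pick the narrowest routine that still covers it */
            if (bytes > CHACHA_BLOCK_SIZE * 4)
                    chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
            else if (bytes > CHACHA_BLOCK_SIZE * 2)
                    chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
            else
                    chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
            state[12] += (uint32_t)((bytes + CHACHA_BLOCK_SIZE - 1) /
                                    CHACHA_BLOCK_SIZE);
    }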
                                                      
