
TOMOYO Linux Cross Reference
Linux/arch/x86/crypto/chacha-ssse3-x86_64.S

  1 /* SPDX-License-Identifier: GPL-2.0-or-later */
  2 /*                                                
  3  * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
  4  *                                                
  5  * Copyright (C) 2015 Martin Willi                
  6  */                                               
  7                                                   
  8 #include <linux/linkage.h>                        
  9 #include <asm/frame.h>                            
 10                                                   
 11 .section        .rodata.cst16.ROT8, "aM", @progbits, 16
 12 .align 16
 13 ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
 14 .section        .rodata.cst16.ROT16, "aM", @progbits, 16
 15 .align 16
 16 ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
 17 .section        .rodata.cst16.CTRINC, "aM", @progbits, 16
 18 .align 16
 19 CTRINC: .octa 0x00000003000000020000000100000000
 20                                                   
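
The three constants above are pshufb control masks (ROT8, ROT16) plus a per-lane counter increment (CTRINC). As a quick aside, the following standalone C sketch (an illustration, not part of this file) checks that the ROT8 byte permutation is exactly a 32-bit left-rotate by 8 on a little-endian machine; ROT16 works the same way for a rotate by 16, and CTRINC simply adds 0,1,2,3 to the four 32-bit lanes of x12:

/*
 * Standalone sketch, not kernel code: byte i of a pshufb result is taken
 * from byte mask[i] of the source, so the ROT8 mask rotates every 32-bit
 * word left by 8 bits.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same bytes as ROT8 above, least-significant byte first. */
static const uint8_t rot8_mask[16] = {
        0x03, 0x00, 0x01, 0x02,  0x07, 0x04, 0x05, 0x06,
        0x0b, 0x08, 0x09, 0x0a,  0x0f, 0x0c, 0x0d, 0x0e,
};

int main(void)
{
        uint32_t words[4] = { 0x11223344, 0xdeadbeef, 0x01000002, 0xabcdef01 };
        uint8_t in[16], out[16];
        int i;

        memcpy(in, words, sizeof(in));
        for (i = 0; i < 16; i++)
                out[i] = in[rot8_mask[i]];      /* what pshufb does */

        for (i = 0; i < 4; i++) {
                uint32_t shuffled, rotated;

                memcpy(&shuffled, out + 4 * i, sizeof(shuffled));
                rotated = (words[i] << 8) | (words[i] >> 24);
                printf("%08x %08x %s\n", shuffled, rotated,
                       shuffled == rotated ? "ok" : "MISMATCH");
        }
        return 0;
}
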
 21 .text                                             
 22                                                   
 23 /*                                                
 24  * chacha_permute - permute one block             
 25  *                                                
 26  * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
 27  * function performs matrix operations on four words in parallel, but requires
 28  * shuffling to rearrange the words after each round.  8/16-bit word rotation is
 29  * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
 30  * rotation uses traditional shift+OR.            
 31  *                                                
 32  * The round count is given in %r8d.              
 33  *                                                
 34  * Clobbers: %r8d, %xmm4-%xmm7                    
 35  */                                               
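
For readers more comfortable with C, here is a plain-C sketch (an illustration, not kernel code) of the double round that the loop below vectorizes. chacha_permute keeps one row of the 4x4 state per SSE register, runs the four column quarter-rounds in parallel, re-diagonalizes with pshufd, runs the four diagonal quarter-rounds, and repeats nrounds/2 times (%r8d is decremented by 2 per iteration):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
        return (v << n) | (v >> (32 - n));
}

/* One ChaCha quarter-round on four words of the 16-word state. */
static void quarterround(uint32_t x[16], int a, int b, int c, int d)
{
        x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
        x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
        x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
        x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
}

/*
 * One double round: column rounds, then diagonal rounds.  The SSSE3 code
 * performs each group of four quarter-rounds at once (one state row per
 * register) and uses pshufd shuffles instead of the varying indices here.
 */
void chacha_doubleround(uint32_t x[16])
{
        quarterround(x, 0, 4,  8, 12);
        quarterround(x, 1, 5,  9, 13);
        quarterround(x, 2, 6, 10, 14);
        quarterround(x, 3, 7, 11, 15);

        quarterround(x, 0, 5, 10, 15);
        quarterround(x, 1, 6, 11, 12);
        quarterround(x, 2, 7,  8, 13);
        quarterround(x, 3, 4,  9, 14);
}
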
 36 SYM_FUNC_START_LOCAL(chacha_permute)              
 37                                                   
 38         movdqa          ROT8(%rip),%xmm4          
 39         movdqa          ROT16(%rip),%xmm5         
 40                                                   
 41 .Ldoubleround:                                    
 42         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)      
 43         paddd           %xmm1,%xmm0               
 44         pxor            %xmm0,%xmm3               
 45         pshufb          %xmm5,%xmm3               
 46                                                   
 47         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)      
 48         paddd           %xmm3,%xmm2               
 49         pxor            %xmm2,%xmm1               
 50         movdqa          %xmm1,%xmm6               
 51         pslld           $12,%xmm6                 
 52         psrld           $20,%xmm1                 
 53         por             %xmm6,%xmm1               
 54                                                   
 55         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)       
 56         paddd           %xmm1,%xmm0               
 57         pxor            %xmm0,%xmm3               
 58         pshufb          %xmm4,%xmm3               
 59                                                   
 60         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)       
 61         paddd           %xmm3,%xmm2               
 62         pxor            %xmm2,%xmm1               
 63         movdqa          %xmm1,%xmm7               
 64         pslld           $7,%xmm7                  
 65         psrld           $25,%xmm1                 
 66         por             %xmm7,%xmm1               
 67                                                   
 68         # x1 = shuffle32(x1, MASK(0, 3, 2, 1))    
 69         pshufd          $0x39,%xmm1,%xmm1         
 70         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))    
 71         pshufd          $0x4e,%xmm2,%xmm2         
 72         # x3 = shuffle32(x3, MASK(2, 1, 0, 3))    
 73         pshufd          $0x93,%xmm3,%xmm3         
 74                                                   
 75         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)      
 76         paddd           %xmm1,%xmm0               
 77         pxor            %xmm0,%xmm3               
 78         pshufb          %xmm5,%xmm3               
 79                                                   
 80         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)      
 81         paddd           %xmm3,%xmm2               
 82         pxor            %xmm2,%xmm1               
 83         movdqa          %xmm1,%xmm6               
 84         pslld           $12,%xmm6                 
 85         psrld           $20,%xmm1                 
 86         por             %xmm6,%xmm1               
 87                                                   
 88         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)       
 89         paddd           %xmm1,%xmm0               
 90         pxor            %xmm0,%xmm3               
 91         pshufb          %xmm4,%xmm3               
 92                                                   
 93         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)       
 94         paddd           %xmm3,%xmm2               
 95         pxor            %xmm2,%xmm1               
 96         movdqa          %xmm1,%xmm7               
 97         pslld           $7,%xmm7                  
 98         psrld           $25,%xmm1                 
 99         por             %xmm7,%xmm1               
100                                                   
101         # x1 = shuffle32(x1, MASK(2, 1, 0, 3))    
102         pshufd          $0x93,%xmm1,%xmm1         
103         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))    
104         pshufd          $0x4e,%xmm2,%xmm2         
105         # x3 = shuffle32(x3, MASK(0, 3, 2, 1))    
106         pshufd          $0x39,%xmm3,%xmm3         
107                                                   
108         sub             $2,%r8d                   
109         jnz             .Ldoubleround             
110                                                   
111         RET                                       
112 SYM_FUNC_END(chacha_permute)                      
113                                                   
114 SYM_FUNC_START(chacha_block_xor_ssse3)            
115         # %rdi: Input state matrix, s             
116         # %rsi: up to 1 data block output, o      
117         # %rdx: up to 1 data block input, i       
118         # %rcx: input/output length in bytes      
119         # %r8d: nrounds                           
120         FRAME_BEGIN                               
121                                                   
122         # x0..3 = s0..3                           
123         movdqu          0x00(%rdi),%xmm0          
124         movdqu          0x10(%rdi),%xmm1          
125         movdqu          0x20(%rdi),%xmm2          
126         movdqu          0x30(%rdi),%xmm3          
127         movdqa          %xmm0,%xmm8               
128         movdqa          %xmm1,%xmm9               
129         movdqa          %xmm2,%xmm10              
130         movdqa          %xmm3,%xmm11              
131                                                   
132         mov             %rcx,%rax                 
133         call            chacha_permute            
134                                                   
135         # o0 = i0 ^ (x0 + s0)                     
136         paddd           %xmm8,%xmm0               
137         cmp             $0x10,%rax                
138         jl              .Lxorpart                 
139         movdqu          0x00(%rdx),%xmm4          
140         pxor            %xmm4,%xmm0               
141         movdqu          %xmm0,0x00(%rsi)          
142         # o1 = i1 ^ (x1 + s1)                     
143         paddd           %xmm9,%xmm1               
144         movdqa          %xmm1,%xmm0               
145         cmp             $0x20,%rax                
146         jl              .Lxorpart                 
147         movdqu          0x10(%rdx),%xmm0          
148         pxor            %xmm1,%xmm0               
149         movdqu          %xmm0,0x10(%rsi)          
150         # o2 = i2 ^ (x2 + s2)                     
151         paddd           %xmm10,%xmm2              
152         movdqa          %xmm2,%xmm0               
153         cmp             $0x30,%rax                
154         jl              .Lxorpart                 
155         movdqu          0x20(%rdx),%xmm0          
156         pxor            %xmm2,%xmm0               
157         movdqu          %xmm0,0x20(%rsi)          
158         # o3 = i3 ^ (x3 + s3)                     
159         paddd           %xmm11,%xmm3              
160         movdqa          %xmm3,%xmm0               
161         cmp             $0x40,%rax                
162         jl              .Lxorpart                 
163         movdqu          0x30(%rdx),%xmm0          
164         pxor            %xmm3,%xmm0               
165         movdqu          %xmm0,0x30(%rsi)          
166                                                   
167 .Ldone:                                           
168         FRAME_END                                 
169         RET                                       
170                                                   
171 .Lxorpart:                                        
172         # xor remaining bytes from partial register into output
173         mov             %rax,%r9                  
174         and             $0x0f,%r9                 
175         jz              .Ldone                    
176         and             $~0x0f,%rax               
177                                                   
178         mov             %rsi,%r11                 
179                                                   
180         lea             8(%rsp),%r10              
181         sub             $0x10,%rsp                
182         and             $~31,%rsp                 
183                                                   
184         lea             (%rdx,%rax),%rsi          
185         mov             %rsp,%rdi                 
186         mov             %r9,%rcx                  
187         rep movsb                                 
188                                                   
189         pxor            0x00(%rsp),%xmm0          
190         movdqa          %xmm0,0x00(%rsp)          
191                                                   
192         mov             %rsp,%rsi                 
193         lea             (%r11,%rax),%rdi          
194         mov             %r9,%rcx                  
195         rep movsb                                 
196                                                   
197         lea             -8(%r10),%rsp             
198         jmp             .Ldone                    
199                                                   
200 SYM_FUNC_END(chacha_block_xor_ssse3)              
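
The .Lxorpart path above handles a length that is not a multiple of 16: it rounds the length down, copies the remaining 1-15 input bytes into an aligned scratch slot on the stack, XORs them with the 16 keystream bytes left in %xmm0, and copies only those trailing bytes back out. A standalone C sketch of the same idea (names here are illustrative, not kernel APIs):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * keystream[] stands for the 16 keystream bytes held in %xmm0 when the
 * length check branches to .Lxorpart.
 */
static void xor_partial_block(uint8_t *dst, const uint8_t *src, size_t len,
                              const uint8_t keystream[16])
{
        size_t done = len & ~(size_t)0x0f;      /* full 16-byte chunks already written */
        size_t rem  = len & 0x0f;               /* 1..15 trailing bytes */
        uint8_t buf[16];
        size_t i;

        if (!rem)
                return;

        memcpy(buf, src + done, rem);           /* first "rep movsb" */
        for (i = 0; i < rem; i++)               /* the pxor on the stack slot */
                buf[i] ^= keystream[i];
        memcpy(dst + done, buf, rem);           /* second "rep movsb" */
}

In the assembly the pxor covers the whole 16-byte stack slot, but only the trailing bytes are copied back to the destination, which is why the sketch can restrict the loop to rem bytes.
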
201                                                   
202 SYM_FUNC_START(hchacha_block_ssse3)               
203         # %rdi: Input state matrix, s             
204         # %rsi: output (8 32-bit words)           
205         # %edx: nrounds                           
206         FRAME_BEGIN                               
207                                                   
208         movdqu          0x00(%rdi),%xmm0          
209         movdqu          0x10(%rdi),%xmm1          
210         movdqu          0x20(%rdi),%xmm2          
211         movdqu          0x30(%rdi),%xmm3          
212                                                   
213         mov             %edx,%r8d                 
214         call            chacha_permute            
215                                                   
216         movdqu          %xmm0,0x00(%rsi)          
217         movdqu          %xmm3,0x10(%rsi)          
218                                                   
219         FRAME_END                                 
220         RET                                       
221 SYM_FUNC_END(hchacha_block_ssse3)                 
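
hchacha_block_ssse3 is the HChaCha core used, for example, in XChaCha key derivation: it runs the same permutation as chacha_permute but skips the feed-forward addition and emits only state words 0-3 (%xmm0) and 12-15 (%xmm3). A rough C equivalent (a sketch reusing chacha_doubleround() from the earlier example, not kernel code):

#include <stdint.h>
#include <string.h>

void chacha_doubleround(uint32_t x[16]);        /* from the sketch above */

static void hchacha_block_ref(const uint32_t state[16], uint32_t out[8],
                              int nrounds)
{
        uint32_t x[16];
        int i;

        memcpy(x, state, sizeof(x));
        for (i = 0; i < nrounds; i += 2)
                chacha_doubleround(x);

        memcpy(&out[0], &x[0],  4 * sizeof(uint32_t));  /* %xmm0 */
        memcpy(&out[4], &x[12], 4 * sizeof(uint32_t));  /* %xmm3 */
}
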
222                                                   
223 SYM_FUNC_START(chacha_4block_xor_ssse3)           
224         # %rdi: Input state matrix, s             
225         # %rsi: up to 4 data blocks output, o     
226         # %rdx: up to 4 data blocks input, i      
227         # %rcx: input/output length in bytes      
228         # %r8d: nrounds                           
229                                                   
230         # This function encrypts four consecutive ChaCha blocks by loading
231         # the state matrix in SSE registers four times. As we need some scratch
232         # registers, we save the first four registers on the stack. The
233         # algorithm performs each operation on the corresponding word of each
234         # state matrix, hence requires no word shuffling. For final XORing step
235         # we transpose the matrix by interleaving 32- and then 64-bit words,
236         # which allows us to do XOR in SSE registers. 8/16-bit word rotation is
237         # done with the slightly better performing SSSE3 byte shuffling,
238         # 7/12-bit word rotation uses traditional shift+OR.
239                                                   
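
In other words, the four-block version transposes the problem: each SSE register holds the same state word from all four blocks, one block per 32-bit lane, so a single instruction advances all four blocks at once. A small C sketch of this layout and of the CTRINC counter assignment (illustrative only, not kernel code):

#include <stdint.h>

struct vec4 { uint32_t lane[4]; };      /* stands in for one %xmm register */

/*
 * vec[i] holds word i of all four blocks.  This mirrors the movq/pshufd
 * broadcast sequence below; the rounds then operate lane-wise, and the
 * 32-/64-bit interleaving at the end transposes the data back so each
 * block's keystream is contiguous for the XOR with the input.
 */
static void load_4block_state(struct vec4 vec[16], const uint32_t state[16])
{
        int i, l;

        for (i = 0; i < 16; i++)
                for (l = 0; l < 4; l++)
                        vec[i].lane[l] = state[i];

        /* CTRINC: block counters 0..3, one per lane of x12. */
        for (l = 0; l < 4; l++)
                vec[12].lane[l] += l;
}
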
240         lea             8(%rsp),%r10              
241         sub             $0x80,%rsp                
242         and             $~63,%rsp                 
243         mov             %rcx,%rax                 
244                                                   
245         # x0..15[0-3] = s0..3[0..3]               
246         movq            0x00(%rdi),%xmm1          
247         pshufd          $0x00,%xmm1,%xmm0         
248         pshufd          $0x55,%xmm1,%xmm1         
249         movq            0x08(%rdi),%xmm3          
250         pshufd          $0x00,%xmm3,%xmm2         
251         pshufd          $0x55,%xmm3,%xmm3         
252         movq            0x10(%rdi),%xmm5          
253         pshufd          $0x00,%xmm5,%xmm4         
254         pshufd          $0x55,%xmm5,%xmm5         
255         movq            0x18(%rdi),%xmm7          
256         pshufd          $0x00,%xmm7,%xmm6         
257         pshufd          $0x55,%xmm7,%xmm7         
258         movq            0x20(%rdi),%xmm9          
259         pshufd          $0x00,%xmm9,%xmm8         
260         pshufd          $0x55,%xmm9,%xmm9         
261         movq            0x28(%rdi),%xmm11         
262         pshufd          $0x00,%xmm11,%xmm10       
263         pshufd          $0x55,%xmm11,%xmm11       
264         movq            0x30(%rdi),%xmm13         
265         pshufd          $0x00,%xmm13,%xmm12       
266         pshufd          $0x55,%xmm13,%xmm13       
267         movq            0x38(%rdi),%xmm15         
268         pshufd          $0x00,%xmm15,%xmm14       
269         pshufd          $0x55,%xmm15,%xmm15       
270         # x0..3 on stack                          
271         movdqa          %xmm0,0x00(%rsp)          
272         movdqa          %xmm1,0x10(%rsp)          
273         movdqa          %xmm2,0x20(%rsp)          
274         movdqa          %xmm3,0x30(%rsp)          
275                                                   
276         movdqa          CTRINC(%rip),%xmm1        
277         movdqa          ROT8(%rip),%xmm2          
278         movdqa          ROT16(%rip),%xmm3         
279                                                   
280         # x12 += counter values 0-3               
281         paddd           %xmm1,%xmm12              
282                                                   
283 .Ldoubleround4:                                   
284         # x0 += x4, x12 = rotl32(x12 ^ x0, 16)    
285         movdqa          0x00(%rsp),%xmm0          
286         paddd           %xmm4,%xmm0               
287         movdqa          %xmm0,0x00(%rsp)          
288         pxor            %xmm0,%xmm12              
289         pshufb          %xmm3,%xmm12              
290         # x1 += x5, x13 = rotl32(x13 ^ x1, 16)    
291         movdqa          0x10(%rsp),%xmm0          
292         paddd           %xmm5,%xmm0               
293         movdqa          %xmm0,0x10(%rsp)          
294         pxor            %xmm0,%xmm13              
295         pshufb          %xmm3,%xmm13              
296         # x2 += x6, x14 = rotl32(x14 ^ x2, 16)    
297         movdqa          0x20(%rsp),%xmm0          
298         paddd           %xmm6,%xmm0               
299         movdqa          %xmm0,0x20(%rsp)          
300         pxor            %xmm0,%xmm14              
301         pshufb          %xmm3,%xmm14              
302         # x3 += x7, x15 = rotl32(x15 ^ x3, 16)    
303         movdqa          0x30(%rsp),%xmm0          
304         paddd           %xmm7,%xmm0               
305         movdqa          %xmm0,0x30(%rsp)          
306         pxor            %xmm0,%xmm15              
307         pshufb          %xmm3,%xmm15              
308                                                   
309         # x8 += x12, x4 = rotl32(x4 ^ x8, 12)     
310         paddd           %xmm12,%xmm8              
311         pxor            %xmm8,%xmm4               
312         movdqa          %xmm4,%xmm0               
313         pslld           $12,%xmm0                 
314         psrld           $20,%xmm4                 
315         por             %xmm0,%xmm4               
316         # x9 += x13, x5 = rotl32(x5 ^ x9, 12)     
317         paddd           %xmm13,%xmm9              
318         pxor            %xmm9,%xmm5               
319         movdqa          %xmm5,%xmm0               
320         pslld           $12,%xmm0                 
321         psrld           $20,%xmm5                 
322         por             %xmm0,%xmm5               
323         # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
324         paddd           %xmm14,%xmm10             
325         pxor            %xmm10,%xmm6              
326         movdqa          %xmm6,%xmm0               
327         pslld           $12,%xmm0                 
328         psrld           $20,%xmm6                 
329         por             %xmm0,%xmm6               
330         # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
331         paddd           %xmm15,%xmm11             
332         pxor            %xmm11,%xmm7              
333         movdqa          %xmm7,%xmm0               
334         pslld           $12,%xmm0                 
335         psrld           $20,%xmm7                 
336         por             %xmm0,%xmm7               
337                                                   
338         # x0 += x4, x12 = rotl32(x12 ^ x0, 8)     
339         movdqa          0x00(%rsp),%xmm0          
340         paddd           %xmm4,%xmm0               
341         movdqa          %xmm0,0x00(%rsp)          
342         pxor            %xmm0,%xmm12              
343         pshufb          %xmm2,%xmm12              
344         # x1 += x5, x13 = rotl32(x13 ^ x1, 8)     
345         movdqa          0x10(%rsp),%xmm0          
346         paddd           %xmm5,%xmm0               
347         movdqa          %xmm0,0x10(%rsp)          
348         pxor            %xmm0,%xmm13              
349         pshufb          %xmm2,%xmm13              
350         # x2 += x6, x14 = rotl32(x14 ^ x2, 8)     
351         movdqa          0x20(%rsp),%xmm0          
352         paddd           %xmm6,%xmm0               
353         movdqa          %xmm0,0x20(%rsp)          
354         pxor            %xmm0,%xmm14              
355         pshufb          %xmm2,%xmm14              
356         # x3 += x7, x15 = rotl32(x15 ^ x3, 8)     
357         movdqa          0x30(%rsp),%xmm0          
358         paddd           %xmm7,%xmm0               
359         movdqa          %xmm0,0x30(%rsp)          
360         pxor            %xmm0,%xmm15              
361         pshufb          %xmm2,%xmm15              
362                                                   
363         # x8 += x12, x4 = rotl32(x4 ^ x8, 7)      
364         paddd           %xmm12,%xmm8              
365         pxor            %xmm8,%xmm4               
366         movdqa          %xmm4,%xmm0               
367         pslld           $7,%xmm0                  
368         psrld           $25,%xmm4                 
369         por             %xmm0,%xmm4               
370         # x9 += x13, x5 = rotl32(x5 ^ x9, 7)      
371         paddd           %xmm13,%xmm9              
372         pxor            %xmm9,%xmm5               
373         movdqa          %xmm5,%xmm0               
374         pslld           $7,%xmm0                  
375         psrld           $25,%xmm5                 
376         por             %xmm0,%xmm5               
377         # x10 += x14, x6 = rotl32(x6 ^ x10, 7)    
378         paddd           %xmm14,%xmm10             
379         pxor            %xmm10,%xmm6              
380         movdqa          %xmm6,%xmm0               
381         pslld           $7,%xmm0                  
382         psrld           $25,%xmm6                 
383         por             %xmm0,%xmm6               
384         # x11 += x15, x7 = rotl32(x7 ^ x11, 7)    
385         paddd           %xmm15,%xmm11             
386         pxor            %xmm11,%xmm7              
387         movdqa          %xmm7,%xmm0               
388         pslld           $7,%xmm0                  
389         psrld           $25,%xmm7                 
390         por             %xmm0,%xmm7               
391                                                   
392         # x0 += x5, x15 = rotl32(x15 ^ x0, 16)    
393         movdqa          0x00(%rsp),%xmm0          
394         paddd           %xmm5,%xmm0               
395         movdqa          %xmm0,0x00(%rsp)          
396         pxor            %xmm0,%xmm15              
397         pshufb          %xmm3,%xmm15              
398         # x1 += x6, x12 = rotl32(x12 ^ x1, 16)    
399         movdqa          0x10(%rsp),%xmm0          
400         paddd           %xmm6,%xmm0               
401         movdqa          %xmm0,0x10(%rsp)          
402         pxor            %xmm0,%xmm12              
403         pshufb          %xmm3,%xmm12              
404         # x2 += x7, x13 = rotl32(x13 ^ x2, 16)    
405         movdqa          0x20(%rsp),%xmm0          
406         paddd           %xmm7,%xmm0               
407         movdqa          %xmm0,0x20(%rsp)          
408         pxor            %xmm0,%xmm13              
409         pshufb          %xmm3,%xmm13              
410         # x3 += x4, x14 = rotl32(x14 ^ x3, 16)    
411         movdqa          0x30(%rsp),%xmm0          
412         paddd           %xmm4,%xmm0               
413         movdqa          %xmm0,0x30(%rsp)          
414         pxor            %xmm0,%xmm14              
415         pshufb          %xmm3,%xmm14              
416                                                   
417         # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
418         paddd           %xmm15,%xmm10             
419         pxor            %xmm10,%xmm5              
420         movdqa          %xmm5,%xmm0               
421         pslld           $12,%xmm0                 
422         psrld           $20,%xmm5                 
423         por             %xmm0,%xmm5               
424         # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
425         paddd           %xmm12,%xmm11             
426         pxor            %xmm11,%xmm6              
427         movdqa          %xmm6,%xmm0               
428         pslld           $12,%xmm0                 
429         psrld           $20,%xmm6                 
430         por             %xmm0,%xmm6               
431         # x8 += x13, x7 = rotl32(x7 ^ x8, 12)     
432         paddd           %xmm13,%xmm8              
433         pxor            %xmm8,%xmm7               
434         movdqa          %xmm7,%xmm0               
435         pslld           $12,%xmm0                 
436         psrld           $20,%xmm7                 
437         por             %xmm0,%xmm7               
438         # x9 += x14, x4 = rotl32(x4 ^ x9, 12)     
439         paddd           %xmm14,%xmm9              
440         pxor            %xmm9,%xmm4               
441         movdqa          %xmm4,%xmm0               
442         pslld           $12,%xmm0                 
443         psrld           $20,%xmm4                 
444         por             %xmm0,%xmm4               
445                                                   
446         # x0 += x5, x15 = rotl32(x15 ^ x0, 8)     
447         movdqa          0x00(%rsp),%xmm0          
448         paddd           %xmm5,%xmm0               
449         movdqa          %xmm0,0x00(%rsp)          
450         pxor            %xmm0,%xmm15              
451         pshufb          %xmm2,%xmm15              
452         # x1 += x6, x12 = rotl32(x12 ^ x1, 8)     
453         movdqa          0x10(%rsp),%xmm0          
454         paddd           %xmm6,%xmm0               
455         movdqa          %xmm0,0x10(%rsp)          
456         pxor            %xmm0,%xmm12              
457         pshufb          %xmm2,%xmm12              
458         # x2 += x7, x13 = rotl32(x13 ^ x2, 8)     
459         movdqa          0x20(%rsp),%xmm0          
460         paddd           %xmm7,%xmm0               
461         movdqa          %xmm0,0x20(%rsp)          
462         pxor            %xmm0,%xmm13              
463         pshufb          %xmm2,%xmm13              
464         # x3 += x4, x14 = rotl32(x14 ^ x3, 8)     
465         movdqa          0x30(%rsp),%xmm0          
466         paddd           %xmm4,%xmm0               
467         movdqa          %xmm0,0x30(%rsp)          
468         pxor            %xmm0,%xmm14              
469         pshufb          %xmm2,%xmm14              
470                                                   
471         # x10 += x15, x5 = rotl32(x5 ^ x10, 7)    
472         paddd           %xmm15,%xmm10             
473         pxor            %xmm10,%xmm5              
474         movdqa          %xmm5,%xmm0               
475         pslld           $7,%xmm0                  
476         psrld           $25,%xmm5                 
477         por             %xmm0,%xmm5               
478         # x11 += x12, x6 = rotl32(x6 ^ x11, 7)    
479         paddd           %xmm12,%xmm11             
480         pxor            %xmm11,%xmm6              
481         movdqa          %xmm6,%xmm0               
482         pslld           $7,%xmm0                  
483         psrld           $25,%xmm6                 
484         por             %xmm0,%xmm6               
485         # x8 += x13, x7 = rotl32(x7 ^ x8, 7)      
486         paddd           %xmm13,%xmm8              
487         pxor            %xmm8,%xmm7               
488         movdqa          %xmm7,%xmm0               
489         pslld           $7,%xmm0                  
490         psrld           $25,%xmm7                 
491         por             %xmm0,%xmm7               
492         # x9 += x14, x4 = rotl32(x4 ^ x9, 7)      
493         paddd           %xmm14,%xmm9              
494         pxor            %xmm9,%xmm4               
495         movdqa          %xmm4,%xmm0               
496         pslld           $7,%xmm0                  
497         psrld           $25,%xmm4                 
498         por             %xmm0,%xmm4               
499                                                   
500         sub             $2,%r8d                   
501         jnz             .Ldoubleround4            
502                                                   
503         # x0[0-3] += s0[0]                        
504         # x1[0-3] += s0[1]                        
505         movq            0x00(%rdi),%xmm3          
506         pshufd          $0x00,%xmm3,%xmm2         
507         pshufd          $0x55,%xmm3,%xmm3         
508         paddd           0x00(%rsp),%xmm2          
509         movdqa          %xmm2,0x00(%rsp)          
510         paddd           0x10(%rsp),%xmm3          
511         movdqa          %xmm3,0x10(%rsp)          
512         # x2[0-3] += s0[2]                        
513         # x3[0-3] += s0[3]                        
514         movq            0x08(%rdi),%xmm3          
515         pshufd          $0x00,%xmm3,%xmm2         
516         pshufd          $0x55,%xmm3,%xmm3         
517         paddd           0x20(%rsp),%xmm2          
518         movdqa          %xmm2,0x20(%rsp)          
519         paddd           0x30(%rsp),%xmm3          
520         movdqa          %xmm3,0x30(%rsp)          
521                                                   
522         # x4[0-3] += s1[0]                        
523         # x5[0-3] += s1[1]                        
524         movq            0x10(%rdi),%xmm3          
525         pshufd          $0x00,%xmm3,%xmm2         
526         pshufd          $0x55,%xmm3,%xmm3         
527         paddd           %xmm2,%xmm4               
528         paddd           %xmm3,%xmm5               
529         # x6[0-3] += s1[2]                        
530         # x7[0-3] += s1[3]                        
531         movq            0x18(%rdi),%xmm3          
532         pshufd          $0x00,%xmm3,%xmm2         
533         pshufd          $0x55,%xmm3,%xmm3         
534         paddd           %xmm2,%xmm6               
535         paddd           %xmm3,%xmm7               
536                                                   
537         # x8[0-3] += s2[0]                        
538         # x9[0-3] += s2[1]                        
539         movq            0x20(%rdi),%xmm3          
540         pshufd          $0x00,%xmm3,%xmm2         
541         pshufd          $0x55,%xmm3,%xmm3         
542         paddd           %xmm2,%xmm8               
543         paddd           %xmm3,%xmm9               
544         # x10[0-3] += s2[2]                       
545         # x11[0-3] += s2[3]                       
546         movq            0x28(%rdi),%xmm3          
547         pshufd          $0x00,%xmm3,%xmm2         
548         pshufd          $0x55,%xmm3,%xmm3         
549         paddd           %xmm2,%xmm10              
550         paddd           %xmm3,%xmm11              
551                                                   
552         # x12[0-3] += s3[0]                       
553         # x13[0-3] += s3[1]                       
554         movq            0x30(%rdi),%xmm3          
555         pshufd          $0x00,%xmm3,%xmm2         
556         pshufd          $0x55,%xmm3,%xmm3         
557         paddd           %xmm2,%xmm12              
558         paddd           %xmm3,%xmm13              
559         # x14[0-3] += s3[2]                       
560         # x15[0-3] += s3[3]                       
561         movq            0x38(%rdi),%xmm3          
562         pshufd          $0x00,%xmm3,%xmm2         
563         pshufd          $0x55,%xmm3,%xmm3         
564         paddd           %xmm2,%xmm14              
565         paddd           %xmm3,%xmm15              
566                                                   
567         # x12 += counter values 0-3               
568         paddd           %xmm1,%xmm12              
569                                                   
570         # interleave 32-bit words in state n, n+1
571         movdqa          0x00(%rsp),%xmm0          
572         movdqa          0x10(%rsp),%xmm1          
573         movdqa          %xmm0,%xmm2               
574         punpckldq       %xmm1,%xmm2               
575         punpckhdq       %xmm1,%xmm0               
576         movdqa          %xmm2,0x00(%rsp)          
577         movdqa          %xmm0,0x10(%rsp)          
578         movdqa          0x20(%rsp),%xmm0          
579         movdqa          0x30(%rsp),%xmm1          
580         movdqa          %xmm0,%xmm2               
581         punpckldq       %xmm1,%xmm2               
582         punpckhdq       %xmm1,%xmm0               
583         movdqa          %xmm2,0x20(%rsp)          
584         movdqa          %xmm0,0x30(%rsp)          
585         movdqa          %xmm4,%xmm0               
586         punpckldq       %xmm5,%xmm4               
587         punpckhdq       %xmm5,%xmm0               
588         movdqa          %xmm0,%xmm5               
589         movdqa          %xmm6,%xmm0               
590         punpckldq       %xmm7,%xmm6               
591         punpckhdq       %xmm7,%xmm0               
592         movdqa          %xmm0,%xmm7               
593         movdqa          %xmm8,%xmm0               
594         punpckldq       %xmm9,%xmm8               
595         punpckhdq       %xmm9,%xmm0               
596         movdqa          %xmm0,%xmm9               
597         movdqa          %xmm10,%xmm0              
598         punpckldq       %xmm11,%xmm10             
599         punpckhdq       %xmm11,%xmm0              
600         movdqa          %xmm0,%xmm11              
601         movdqa          %xmm12,%xmm0              
602         punpckldq       %xmm13,%xmm12             
603         punpckhdq       %xmm13,%xmm0              
604         movdqa          %xmm0,%xmm13              
605         movdqa          %xmm14,%xmm0              
606         punpckldq       %xmm15,%xmm14             
607         punpckhdq       %xmm15,%xmm0              
608         movdqa          %xmm0,%xmm15              
609                                                   
610         # interleave 64-bit words in state n, n+2
611         movdqa          0x00(%rsp),%xmm0          
612         movdqa          0x20(%rsp),%xmm1          
613         movdqa          %xmm0,%xmm2               
614         punpcklqdq      %xmm1,%xmm2               
615         punpckhqdq      %xmm1,%xmm0               
616         movdqa          %xmm2,0x00(%rsp)          
617         movdqa          %xmm0,0x20(%rsp)          
618         movdqa          0x10(%rsp),%xmm0          
619         movdqa          0x30(%rsp),%xmm1          
620         movdqa          %xmm0,%xmm2               
621         punpcklqdq      %xmm1,%xmm2               
622         punpckhqdq      %xmm1,%xmm0               
623         movdqa          %xmm2,0x10(%rsp)          
624         movdqa          %xmm0,0x30(%rsp)          
625         movdqa          %xmm4,%xmm0               
626         punpcklqdq      %xmm6,%xmm4               
627         punpckhqdq      %xmm6,%xmm0               
628         movdqa          %xmm0,%xmm6               
629         movdqa          %xmm5,%xmm0               
630         punpcklqdq      %xmm7,%xmm5               
631         punpckhqdq      %xmm7,%xmm0               
632         movdqa          %xmm0,%xmm7               
633         movdqa          %xmm8,%xmm0               
634         punpcklqdq      %xmm10,%xmm8              
635         punpckhqdq      %xmm10,%xmm0              
636         movdqa          %xmm0,%xmm10              
637         movdqa          %xmm9,%xmm0               
638         punpcklqdq      %xmm11,%xmm9              
639         punpckhqdq      %xmm11,%xmm0              
640         movdqa          %xmm0,%xmm11              
641         movdqa          %xmm12,%xmm0              
642         punpcklqdq      %xmm14,%xmm12             
643         punpckhqdq      %xmm14,%xmm0              
644         movdqa          %xmm0,%xmm14              
645         movdqa          %xmm13,%xmm0              
646         punpcklqdq      %xmm15,%xmm13             
647         punpckhqdq      %xmm15,%xmm0              
648         movdqa          %xmm0,%xmm15              
649                                                   
650         # xor with corresponding input, write to output
651         movdqa          0x00(%rsp),%xmm0          
652         cmp             $0x10,%rax                
653         jl              .Lxorpart4                
654         movdqu          0x00(%rdx),%xmm1          
655         pxor            %xmm1,%xmm0               
656         movdqu          %xmm0,0x00(%rsi)          
657                                                   
658         movdqu          %xmm4,%xmm0               
659         cmp             $0x20,%rax                
660         jl              .Lxorpart4                
661         movdqu          0x10(%rdx),%xmm1          
662         pxor            %xmm1,%xmm0               
663         movdqu          %xmm0,0x10(%rsi)          
664                                                   
665         movdqu          %xmm8,%xmm0               
666         cmp             $0x30,%rax                
667         jl              .Lxorpart4                
668         movdqu          0x20(%rdx),%xmm1          
669         pxor            %xmm1,%xmm0               
670         movdqu          %xmm0,0x20(%rsi)          
671                                                   
672         movdqu          %xmm12,%xmm0              
673         cmp             $0x40,%rax                
674         jl              .Lxorpart4                
675         movdqu          0x30(%rdx),%xmm1          
676         pxor            %xmm1,%xmm0               
677         movdqu          %xmm0,0x30(%rsi)          
678                                                   
679         movdqa          0x20(%rsp),%xmm0          
680         cmp             $0x50,%rax                
681         jl              .Lxorpart4                
682         movdqu          0x40(%rdx),%xmm1          
683         pxor            %xmm1,%xmm0               
684         movdqu          %xmm0,0x40(%rsi)          
685                                                   
686         movdqu          %xmm6,%xmm0               
687         cmp             $0x60,%rax                
688         jl              .Lxorpart4                
689         movdqu          0x50(%rdx),%xmm1          
690         pxor            %xmm1,%xmm0               
691         movdqu          %xmm0,0x50(%rsi)          
692                                                   
693         movdqu          %xmm10,%xmm0              
694         cmp             $0x70,%rax                
695         jl              .Lxorpart4                
696         movdqu          0x60(%rdx),%xmm1          
697         pxor            %xmm1,%xmm0               
698         movdqu          %xmm0,0x60(%rsi)          
699                                                   
700         movdqu          %xmm14,%xmm0              
701         cmp             $0x80,%rax                
702         jl              .Lxorpart4                
703         movdqu          0x70(%rdx),%xmm1          
704         pxor            %xmm1,%xmm0               
705         movdqu          %xmm0,0x70(%rsi)          
706                                                   
707         movdqa          0x10(%rsp),%xmm0          
708         cmp             $0x90,%rax                
709         jl              .Lxorpart4                
710         movdqu          0x80(%rdx),%xmm1          
711         pxor            %xmm1,%xmm0               
712         movdqu          %xmm0,0x80(%rsi)          
713                                                   
714         movdqu          %xmm5,%xmm0               
715         cmp             $0xa0,%rax                
716         jl              .Lxorpart4                
717         movdqu          0x90(%rdx),%xmm1          
718         pxor            %xmm1,%xmm0               
719         movdqu          %xmm0,0x90(%rsi)          
720                                                   
721         movdqu          %xmm9,%xmm0               
722         cmp             $0xb0,%rax                
723         jl              .Lxorpart4                
724         movdqu          0xa0(%rdx),%xmm1          
725         pxor            %xmm1,%xmm0               
726         movdqu          %xmm0,0xa0(%rsi)          
727                                                   
728         movdqu          %xmm13,%xmm0              
729         cmp             $0xc0,%rax                
730         jl              .Lxorpart4                
731         movdqu          0xb0(%rdx),%xmm1          
732         pxor            %xmm1,%xmm0               
733         movdqu          %xmm0,0xb0(%rsi)          
734                                                   
735         movdqa          0x30(%rsp),%xmm0          
736         cmp             $0xd0,%rax                
737         jl              .Lxorpart4                
738         movdqu          0xc0(%rdx),%xmm1          
739         pxor            %xmm1,%xmm0               
740         movdqu          %xmm0,0xc0(%rsi)          
741                                                   
742         movdqu          %xmm7,%xmm0               
743         cmp             $0xe0,%rax                
744         jl              .Lxorpart4                
745         movdqu          0xd0(%rdx),%xmm1          
746         pxor            %xmm1,%xmm0               
747         movdqu          %xmm0,0xd0(%rsi)          
748                                                   
749         movdqu          %xmm11,%xmm0              
750         cmp             $0xf0,%rax                
751         jl              .Lxorpart4                
752         movdqu          0xe0(%rdx),%xmm1          
753         pxor            %xmm1,%xmm0               
754         movdqu          %xmm0,0xe0(%rsi)          
755                                                   
756         movdqu          %xmm15,%xmm0              
757         cmp             $0x100,%rax               
758         jl              .Lxorpart4                
759         movdqu          0xf0(%rdx),%xmm1          
760         pxor            %xmm1,%xmm0               
761         movdqu          %xmm0,0xf0(%rsi)          
762                                                   
763 .Ldone4:                                          
764         lea             -8(%r10),%rsp             
765         RET                                       
766                                                   
767 .Lxorpart4:                                       
768         # xor remaining bytes from partial register into output
769         mov             %rax,%r9                  
770         and             $0x0f,%r9                 
771         jz              .Ldone4                   
772         and             $~0x0f,%rax               
773                                                   
774         mov             %rsi,%r11                 
775                                                   
776         lea             (%rdx,%rax),%rsi          
777         mov             %rsp,%rdi                 
778         mov             %r9,%rcx                  
779         rep movsb                                 
780                                                   
781         pxor            0x00(%rsp),%xmm0          
782         movdqa          %xmm0,0x00(%rsp)          
783                                                   
784         mov             %rsp,%rsi                 
785         lea             (%r11,%rax),%rdi          
786         mov             %r9,%rcx                  
787         rep movsb                                 
788                                                   
789         jmp             .Ldone4                   
790                                                   
791 SYM_FUNC_END(chacha_4block_xor_ssse3)             
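
The argument comments in this file follow the x86-64 SysV calling convention (%rdi, %rsi, %rdx, %rcx, %r8d carry the first five integer arguments), so the C-side declarations used by the x86 ChaCha glue code look roughly as follows (paraphrased here; see arch/x86/crypto/chacha_glue.c in the matching kernel tree for the authoritative prototypes):

asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
                                       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
                                        unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);

len is a byte count of up to one block (64 bytes) or four blocks (256 bytes) respectively; the .Lxorpart/.Lxorpart4 paths above cover the final partial 16-byte chunk when len is not 16-byte aligned.
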
                                                      
