~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/arm64/crypto/chacha-neon-core.S

Version: ~ [ linux-6.12-rc7 ] ~ [ linux-6.11.7 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.60 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.116 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.171 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.229 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.285 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.323 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.12 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

Diff markup

Differences between /arch/arm64/crypto/chacha-neon-core.S (Architecture i386) and /arch/ppc/crypto/chacha-neon-core.S (Architecture ppc)


  1 /*                                                
  2  * ChaCha/XChaCha NEON helper functions           
  3  *                                                
  4  * Copyright (C) 2016-2018 Linaro, Ltd. <ard.bi    
  5  *                                                
  6  * This program is free software; you can redi    
  7  * it under the terms of the GNU General Publi    
  8  * published by the Free Software Foundation.     
  9  *                                                
 10  * Originally based on:                           
 11  * ChaCha20 256-bit cipher algorithm, RFC7539,    
 12  *                                                
 13  * Copyright (C) 2015 Martin Willi                
 14  *                                                
 15  * This program is free software; you can redi    
 16  * it under the terms of the GNU General Publi    
 17  * the Free Software Foundation; either versio    
 18  * (at your option) any later version.            
 19  */                                               
 20                                                   
 21 #include <linux/linkage.h>                        
 22 #include <asm/assembler.h>                        
 23 #include <asm/cache.h>                            
 24                                                   
 25         .text                                     
 26         .align          6                         
 27                                                   
 28 /*
 29  * chacha_permute - permute one block
 30  *
 31  * Permute one 64-byte block where the state matrix is held in the four NEON
 32  * registers v0-v3.  It performs matrix operations on four words in parallel,
 33  * but requires shuffling to rearrange the words after each round.
 34  *
 35  * The round count is given in w3; each pass of the loop performs two rounds.
 36  *
 37  * Clobbers: w3, x10, v4, v12 and the condition flags
 38  */
 39 SYM_FUNC_START_LOCAL(chacha_permute)
 40
 41         adr_l           x10, ROT8
 42         ld1             {v12.4s}, [x10]           // v12 = tbl index vector used for the rotate-by-8 steps
 43
 44 .Ldoubleround:
 45         // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
 46         add             v0.4s, v0.4s, v1.4s
 47         eor             v3.16b, v3.16b, v0.16b
 48         rev32           v3.8h, v3.8h              // swapping the 16-bit halves of each word == rotate by 16
 49
 50         // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
 51         add             v2.4s, v2.4s, v3.4s
 52         eor             v4.16b, v1.16b, v2.16b    // v4 is the temporary for the two-instruction rotate
 53         shl             v1.4s, v4.4s, #12
 54         sri             v1.4s, v4.4s, #20         // shift-right-and-insert fills the low 12 bits
 55
 56         // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
 57         add             v0.4s, v0.4s, v1.4s
 58         eor             v3.16b, v3.16b, v0.16b
 59         tbl             v3.16b, {v3.16b}, v12.16b
 60
 61         // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
 62         add             v2.4s, v2.4s, v3.4s
 63         eor             v4.16b, v1.16b, v2.16b
 64         shl             v1.4s, v4.4s, #7
 65         sri             v1.4s, v4.4s, #25
 66
 67         // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
 68         ext             v1.16b, v1.16b, v1.16b, #4
 69         // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
 70         ext             v2.16b, v2.16b, v2.16b, #8
 71         // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
 72         ext             v3.16b, v3.16b, v3.16b, #12
 73
 74         // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
 75         add             v0.4s, v0.4s, v1.4s
 76         eor             v3.16b, v3.16b, v0.16b
 77         rev32           v3.8h, v3.8h
 78
 79         // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
 80         add             v2.4s, v2.4s, v3.4s
 81         eor             v4.16b, v1.16b, v2.16b
 82         shl             v1.4s, v4.4s, #12
 83         sri             v1.4s, v4.4s, #20
 84
 85         // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
 86         add             v0.4s, v0.4s, v1.4s
 87         eor             v3.16b, v3.16b, v0.16b
 88         tbl             v3.16b, {v3.16b}, v12.16b
 89
 90         // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
 91         add             v2.4s, v2.4s, v3.4s
 92         eor             v4.16b, v1.16b, v2.16b
 93         shl             v1.4s, v4.4s, #7
 94         sri             v1.4s, v4.4s, #25
 95
 96         // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
 97         ext             v1.16b, v1.16b, v1.16b, #12
 98         // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
 99         ext             v2.16b, v2.16b, v2.16b, #8
100         // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
101         ext             v3.16b, v3.16b, v3.16b, #4
102
103         subs            w3, w3, #2                // two rounds were done in this pass
104         b.ne            .Ldoubleround             // exits only when w3 reaches exactly 0, so w3 must be even and non-zero
105
106         ret
107 SYM_FUNC_END(chacha_permute)
108                                                   
109 SYM_FUNC_START(chacha_block_xor_neon)             // o = i ^ (permute(s) + s) for one 64-byte block
110         // x0: Input state matrix, s (64 bytes, loaded as four 16-byte rows)
111         // x1: 1 data block output, o (64 bytes stored)
112         // x2: 1 data block input, i (64 bytes loaded)
113         // w3: nrounds, consumed by chacha_permute
114                                                   
115         stp             x29, x30, [sp, #-16]!     // x30 must be saved: the bl below overwrites it
116         mov             x29, sp                   
117                                                   
118         // x0..3 = s0..3 (the state rows live in v0-v3)
119         ld1             {v0.4s-v3.4s}, [x0]       // working copy, permuted in place
120         ld1             {v8.4s-v11.4s}, [x0]      // untouched copy of s, added back after the permutation
121                                                   
122         bl              chacha_permute            
123                                                   
124         ld1             {v4.16b-v7.16b}, [x2]     // loaded only after the call, because chacha_permute clobbers v4
125                                                   
126         // o0 = i0 ^ (x0 + s0)                    
127         add             v0.4s, v0.4s, v8.4s       
128         eor             v0.16b, v0.16b, v4.16b    
129                                                   
130         // o1 = i1 ^ (x1 + s1)                    
131         add             v1.4s, v1.4s, v9.4s       
132         eor             v1.16b, v1.16b, v5.16b    
133                                                   
134         // o2 = i2 ^ (x2 + s2)                    
135         add             v2.4s, v2.4s, v10.4s      
136         eor             v2.16b, v2.16b, v6.16b    
137                                                   
138         // o3 = i3 ^ (x3 + s3)                    
139         add             v3.4s, v3.4s, v11.4s      
140         eor             v3.16b, v3.16b, v7.16b    
141                                                   
142         st1             {v0.16b-v3.16b}, [x1]     // store all four output rows
143                                                   
144         ldp             x29, x30, [sp], #16       
145         ret                                       
146 SYM_FUNC_END(chacha_block_xor_neon)               
147                                                   
148 SYM_FUNC_START(hchacha_block_neon)                // permute s and output rows 0 and 3 of the result
149         // x0: Input state matrix, s (64 bytes loaded)
150         // x1: output (8 32-bit words): permuted row 0 followed by permuted row 3
151         // w2: nrounds
152                                                   
153         stp             x29, x30, [sp, #-16]!     // x30 must be saved: the bl below overwrites it
154         mov             x29, sp                   
155                                                   
156         ld1             {v0.4s-v3.4s}, [x0]       
157                                                   
158         mov             w3, w2                    // chacha_permute takes its round count in w3
159         bl              chacha_permute            
160                                                   
161         st1             {v0.4s}, [x1], #16        // output words 0-3; x1 is post-incremented by 16
162         st1             {v3.4s}, [x1]             // output words 4-7; unlike chacha_block_xor_neon, s is not added back
163                                                   
164         ldp             x29, x30, [sp], #16       
165         ret                                       
166 SYM_FUNC_END(hchacha_block_neon)                  
167                                                   
168         a0              .req    w12               // a0-a15: names for sixteen 32-bit scalar state words;
169         a1              .req    w13               //   chacha_4block_xor_neon fills them from lane 0 of v0-v15
170         a2              .req    w14               
171         a3              .req    w15               
172         a4              .req    w16               
173         a5              .req    w17               
174         a6              .req    w19               // w18 is skipped - NOTE(review): presumably because x18 is platform-reserved; confirm
175         a7              .req    w20               // a6-a15 occupy w19-w28, which AAPCS64 treats as callee-saved
176         a8              .req    w21               //   NOTE(review): presumably preserved by the user's frame_push - confirm
177         a9              .req    w22               
178         a10             .req    w23               
179         a11             .req    w24               
180         a12             .req    w25               
181         a13             .req    w26               
182         a14             .req    w27               
183         a15             .req    w28               
184                                                   
185         .align          6                         
186 SYM_FUNC_START(chacha_4block_xor_neon)            
187         frame_push      10                        
188                                                   
189         // x0: Input state matrix, s              
190         // x1: 4 data blocks output, o            
191         // x2: 4 data blocks input, i             
192         // w3: nrounds                            
193         // x4: byte count                         
194                                                   
195         adr_l           x10, .Lpermute            
196         and             x5, x4, #63               
197         add             x10, x10, x5              
198                                                   
199         //                                        
200         // This function encrypts four consecu    
201         // the state matrix in NEON registers     
202         // each operation on the corresponding    
203         // requires no word shuffling. For fin    
204         // matrix by interleaving 32- and then    
205         // do XOR in NEON registers.              
206         //                                        
207         // At the same time, a fifth block is     
208         // scalar registers                       
209         //                                        
210         adr_l           x9, CTRINC                
211         ld1             {v30.4s-v31.4s}, [x9]     
212                                                   
213         // x0..15[0-3] = s0..3[0..3]              
214         add             x8, x0, #16               
215         ld4r            { v0.4s- v3.4s}, [x0]     
216         ld4r            { v4.4s- v7.4s}, [x8],    
217         ld4r            { v8.4s-v11.4s}, [x8],    
218         ld4r            {v12.4s-v15.4s}, [x8]     
219                                                   
220         mov             a0, v0.s[0]               
221         mov             a1, v1.s[0]               
222         mov             a2, v2.s[0]               
223         mov             a3, v3.s[0]               
224         mov             a4, v4.s[0]               
225         mov             a5, v5.s[0]               
226         mov             a6, v6.s[0]               
227         mov             a7, v7.s[0]               
228         mov             a8, v8.s[0]               
229         mov             a9, v9.s[0]               
230         mov             a10, v10.s[0]             
231         mov             a11, v11.s[0]             
232         mov             a12, v12.s[0]             
233         mov             a13, v13.s[0]             
234         mov             a14, v14.s[0]             
235         mov             a15, v15.s[0]             
236                                                   
237         // x12 += counter values 1-4              
238         add             v12.4s, v12.4s, v30.4s    
239                                                   
240 .Ldoubleround4:                                   
241         // x0 += x4, x12 = rotl32(x12 ^ x0, 16    
242         // x1 += x5, x13 = rotl32(x13 ^ x1, 16    
243         // x2 += x6, x14 = rotl32(x14 ^ x2, 16    
244         // x3 += x7, x15 = rotl32(x15 ^ x3, 16    
245         add             v0.4s, v0.4s, v4.4s       
246           add           a0, a0, a4                
247         add             v1.4s, v1.4s, v5.4s       
248           add           a1, a1, a5                
249         add             v2.4s, v2.4s, v6.4s       
250           add           a2, a2, a6                
251         add             v3.4s, v3.4s, v7.4s       
252           add           a3, a3, a7                
253                                                   
254         eor             v12.16b, v12.16b, v0.1    
255           eor           a12, a12, a0              
256         eor             v13.16b, v13.16b, v1.1    
257           eor           a13, a13, a1              
258         eor             v14.16b, v14.16b, v2.1    
259           eor           a14, a14, a2              
260         eor             v15.16b, v15.16b, v3.1    
261           eor           a15, a15, a3              
262                                                   
263         rev32           v12.8h, v12.8h            
264           ror           a12, a12, #16             
265         rev32           v13.8h, v13.8h            
266           ror           a13, a13, #16             
267         rev32           v14.8h, v14.8h            
268           ror           a14, a14, #16             
269         rev32           v15.8h, v15.8h            
270           ror           a15, a15, #16             
271                                                   
272         // x8 += x12, x4 = rotl32(x4 ^ x8, 12)    
273         // x9 += x13, x5 = rotl32(x5 ^ x9, 12)    
274         // x10 += x14, x6 = rotl32(x6 ^ x10, 1    
275         // x11 += x15, x7 = rotl32(x7 ^ x11, 1    
276         add             v8.4s, v8.4s, v12.4s      
277           add           a8, a8, a12               
278         add             v9.4s, v9.4s, v13.4s      
279           add           a9, a9, a13               
280         add             v10.4s, v10.4s, v14.4s    
281           add           a10, a10, a14             
282         add             v11.4s, v11.4s, v15.4s    
283           add           a11, a11, a15             
284                                                   
285         eor             v16.16b, v4.16b, v8.16    
286           eor           a4, a4, a8                
287         eor             v17.16b, v5.16b, v9.16    
288           eor           a5, a5, a9                
289         eor             v18.16b, v6.16b, v10.1    
290           eor           a6, a6, a10               
291         eor             v19.16b, v7.16b, v11.1    
292           eor           a7, a7, a11               
293                                                   
294         shl             v4.4s, v16.4s, #12        
295         shl             v5.4s, v17.4s, #12        
296         shl             v6.4s, v18.4s, #12        
297         shl             v7.4s, v19.4s, #12        
298                                                   
299         sri             v4.4s, v16.4s, #20        
300           ror           a4, a4, #20               
301         sri             v5.4s, v17.4s, #20        
302           ror           a5, a5, #20               
303         sri             v6.4s, v18.4s, #20        
304           ror           a6, a6, #20               
305         sri             v7.4s, v19.4s, #20        
306           ror           a7, a7, #20               
307                                                   
308         // x0 += x4, x12 = rotl32(x12 ^ x0, 8)    
309         // x1 += x5, x13 = rotl32(x13 ^ x1, 8)    
310         // x2 += x6, x14 = rotl32(x14 ^ x2, 8)    
311         // x3 += x7, x15 = rotl32(x15 ^ x3, 8)    
312         add             v0.4s, v0.4s, v4.4s       
313           add           a0, a0, a4                
314         add             v1.4s, v1.4s, v5.4s       
315           add           a1, a1, a5                
316         add             v2.4s, v2.4s, v6.4s       
317           add           a2, a2, a6                
318         add             v3.4s, v3.4s, v7.4s       
319           add           a3, a3, a7                
320                                                   
321         eor             v12.16b, v12.16b, v0.1    
322           eor           a12, a12, a0              
323         eor             v13.16b, v13.16b, v1.1    
324           eor           a13, a13, a1              
325         eor             v14.16b, v14.16b, v2.1    
326           eor           a14, a14, a2              
327         eor             v15.16b, v15.16b, v3.1    
328           eor           a15, a15, a3              
329                                                   
330         tbl             v12.16b, {v12.16b}, v3    
331           ror           a12, a12, #24             
332         tbl             v13.16b, {v13.16b}, v3    
333           ror           a13, a13, #24             
334         tbl             v14.16b, {v14.16b}, v3    
335           ror           a14, a14, #24             
336         tbl             v15.16b, {v15.16b}, v3    
337           ror           a15, a15, #24             
338                                                   
339         // x8 += x12, x4 = rotl32(x4 ^ x8, 7)     
340         // x9 += x13, x5 = rotl32(x5 ^ x9, 7)     
341         // x10 += x14, x6 = rotl32(x6 ^ x10, 7    
342         // x11 += x15, x7 = rotl32(x7 ^ x11, 7    
343         add             v8.4s, v8.4s, v12.4s      
344           add           a8, a8, a12               
345         add             v9.4s, v9.4s, v13.4s      
346           add           a9, a9, a13               
347         add             v10.4s, v10.4s, v14.4s    
348           add           a10, a10, a14             
349         add             v11.4s, v11.4s, v15.4s    
350           add           a11, a11, a15             
351                                                   
352         eor             v16.16b, v4.16b, v8.16    
353           eor           a4, a4, a8                
354         eor             v17.16b, v5.16b, v9.16    
355           eor           a5, a5, a9                
356         eor             v18.16b, v6.16b, v10.1    
357           eor           a6, a6, a10               
358         eor             v19.16b, v7.16b, v11.1    
359           eor           a7, a7, a11               
360                                                   
361         shl             v4.4s, v16.4s, #7         
362         shl             v5.4s, v17.4s, #7         
363         shl             v6.4s, v18.4s, #7         
364         shl             v7.4s, v19.4s, #7         
365                                                   
366         sri             v4.4s, v16.4s, #25        
367           ror           a4, a4, #25               
368         sri             v5.4s, v17.4s, #25        
369           ror           a5, a5, #25               
370         sri             v6.4s, v18.4s, #25        
371          ror            a6, a6, #25               
372         sri             v7.4s, v19.4s, #25        
373           ror           a7, a7, #25               
374                                                   
375         // x0 += x5, x15 = rotl32(x15 ^ x0, 16    
376         // x1 += x6, x12 = rotl32(x12 ^ x1, 16    
377         // x2 += x7, x13 = rotl32(x13 ^ x2, 16    
378         // x3 += x4, x14 = rotl32(x14 ^ x3, 16    
379         add             v0.4s, v0.4s, v5.4s       
380           add           a0, a0, a5                
381         add             v1.4s, v1.4s, v6.4s       
382           add           a1, a1, a6                
383         add             v2.4s, v2.4s, v7.4s       
384           add           a2, a2, a7                
385         add             v3.4s, v3.4s, v4.4s       
386           add           a3, a3, a4                
387                                                   
388         eor             v15.16b, v15.16b, v0.1    
389           eor           a15, a15, a0              
390         eor             v12.16b, v12.16b, v1.1    
391           eor           a12, a12, a1              
392         eor             v13.16b, v13.16b, v2.1    
393           eor           a13, a13, a2              
394         eor             v14.16b, v14.16b, v3.1    
395           eor           a14, a14, a3              
396                                                   
397         rev32           v15.8h, v15.8h            
398           ror           a15, a15, #16             
399         rev32           v12.8h, v12.8h            
400           ror           a12, a12, #16             
401         rev32           v13.8h, v13.8h            
402           ror           a13, a13, #16             
403         rev32           v14.8h, v14.8h            
404           ror           a14, a14, #16             
405                                                   
406         // x10 += x15, x5 = rotl32(x5 ^ x10, 1    
407         // x11 += x12, x6 = rotl32(x6 ^ x11, 1    
408         // x8 += x13, x7 = rotl32(x7 ^ x8, 12)    
409         // x9 += x14, x4 = rotl32(x4 ^ x9, 12)    
410         add             v10.4s, v10.4s, v15.4s    
411           add           a10, a10, a15             
412         add             v11.4s, v11.4s, v12.4s    
413           add           a11, a11, a12             
414         add             v8.4s, v8.4s, v13.4s      
415           add           a8, a8, a13               
416         add             v9.4s, v9.4s, v14.4s      
417           add           a9, a9, a14               
418                                                   
419         eor             v16.16b, v5.16b, v10.1    
420           eor           a5, a5, a10               
421         eor             v17.16b, v6.16b, v11.1    
422           eor           a6, a6, a11               
423         eor             v18.16b, v7.16b, v8.16    
424           eor           a7, a7, a8                
425         eor             v19.16b, v4.16b, v9.16    
426           eor           a4, a4, a9                
427                                                   
428         shl             v5.4s, v16.4s, #12        
429         shl             v6.4s, v17.4s, #12        
430         shl             v7.4s, v18.4s, #12        
431         shl             v4.4s, v19.4s, #12        
432                                                   
433         sri             v5.4s, v16.4s, #20        
434           ror           a5, a5, #20               
435         sri             v6.4s, v17.4s, #20        
436           ror           a6, a6, #20               
437         sri             v7.4s, v18.4s, #20        
438           ror           a7, a7, #20               
439         sri             v4.4s, v19.4s, #20        
440           ror           a4, a4, #20               
441                                                   
442         // x0 += x5, x15 = rotl32(x15 ^ x0, 8)    
443         // x1 += x6, x12 = rotl32(x12 ^ x1, 8)    
444         // x2 += x7, x13 = rotl32(x13 ^ x2, 8)    
445         // x3 += x4, x14 = rotl32(x14 ^ x3, 8)    
446         add             v0.4s, v0.4s, v5.4s       
447           add           a0, a0, a5                
448         add             v1.4s, v1.4s, v6.4s       
449           add           a1, a1, a6                
450         add             v2.4s, v2.4s, v7.4s       
451           add           a2, a2, a7                
452         add             v3.4s, v3.4s, v4.4s       
453           add           a3, a3, a4                
454                                                   
455         eor             v15.16b, v15.16b, v0.1    
456           eor           a15, a15, a0              
457         eor             v12.16b, v12.16b, v1.1    
458           eor           a12, a12, a1              
459         eor             v13.16b, v13.16b, v2.1    
460           eor           a13, a13, a2              
461         eor             v14.16b, v14.16b, v3.1    
462           eor           a14, a14, a3              
463                                                   
464         tbl             v15.16b, {v15.16b}, v3    
465           ror           a15, a15, #24             
466         tbl             v12.16b, {v12.16b}, v3    
467           ror           a12, a12, #24             
468         tbl             v13.16b, {v13.16b}, v3    
469           ror           a13, a13, #24             
470         tbl             v14.16b, {v14.16b}, v3    
471           ror           a14, a14, #24             
472                                                   
473         // x10 += x15, x5 = rotl32(x5 ^ x10, 7    
474         // x11 += x12, x6 = rotl32(x6 ^ x11, 7    
475         // x8 += x13, x7 = rotl32(x7 ^ x8, 7)     
476         // x9 += x14, x4 = rotl32(x4 ^ x9, 7)     
477         add             v10.4s, v10.4s, v15.4s    
478           add           a10, a10, a15             
479         add             v11.4s, v11.4s, v12.4s    
480           add           a11, a11, a12             
481         add             v8.4s, v8.4s, v13.4s      
482           add           a8, a8, a13               
483         add             v9.4s, v9.4s, v14.4s      
484           add           a9, a9, a14               
485                                                   
486         eor             v16.16b, v5.16b, v10.1    
487           eor           a5, a5, a10               
488         eor             v17.16b, v6.16b, v11.1    
489           eor           a6, a6, a11               
490         eor             v18.16b, v7.16b, v8.16    
491           eor           a7, a7, a8                
492         eor             v19.16b, v4.16b, v9.16    
493           eor           a4, a4, a9                
494                                                   
495         shl             v5.4s, v16.4s, #7         
496         shl             v6.4s, v17.4s, #7         
497         shl             v7.4s, v18.4s, #7         
498         shl             v4.4s, v19.4s, #7         
499                                                   
500         sri             v5.4s, v16.4s, #25        
501           ror           a5, a5, #25               
502         sri             v6.4s, v17.4s, #25        
503           ror           a6, a6, #25               
504         sri             v7.4s, v18.4s, #25        
505           ror           a7, a7, #25               
506         sri             v4.4s, v19.4s, #25        
507           ror           a4, a4, #25               
508                                                   
509         subs            w3, w3, #2                
510         b.ne            .Ldoubleround4            
511                                                   
512         ld4r            {v16.4s-v19.4s}, [x0],    
513         ld4r            {v20.4s-v23.4s}, [x0],    
514                                                   
515         // x12 += counter values 1-4
516         add             v12.4s, v12.4s, v30.4s    
517                                                   
518         // x0[0-3] += s0[0]                       
519         // x1[0-3] += s0[1]                       
520         // x2[0-3] += s0[2]                       
521         // x3[0-3] += s0[3]                       
522         add             v0.4s, v0.4s, v16.4s      
523           mov           w6, v16.s[0]              
524           mov           w7, v17.s[0]              
525         add             v1.4s, v1.4s, v17.4s      
526           mov           w8, v18.s[0]              
527           mov           w9, v19.s[0]              
528         add             v2.4s, v2.4s, v18.4s      
529           add           a0, a0, w6                
530           add           a1, a1, w7                
531         add             v3.4s, v3.4s, v19.4s      
532           add           a2, a2, w8                
533           add           a3, a3, w9                
534 CPU_BE(   rev           a0, a0          )         
535 CPU_BE(   rev           a1, a1          )         
536 CPU_BE(   rev           a2, a2          )         
537 CPU_BE(   rev           a3, a3          )         
538                                                   
539         ld4r            {v24.4s-v27.4s}, [x0],    
540         ld4r            {v28.4s-v31.4s}, [x0]     
541                                                   
542         // x4[0-3] += s1[0]                       
543         // x5[0-3] += s1[1]                       
544         // x6[0-3] += s1[2]                       
545         // x7[0-3] += s1[3]                       
546         add             v4.4s, v4.4s, v20.4s      
547           mov           w6, v20.s[0]              
548           mov           w7, v21.s[0]              
549         add             v5.4s, v5.4s, v21.4s      
550           mov           w8, v22.s[0]              
551           mov           w9, v23.s[0]              
552         add             v6.4s, v6.4s, v22.4s      
553           add           a4, a4, w6                
554           add           a5, a5, w7                
555         add             v7.4s, v7.4s, v23.4s      
556           add           a6, a6, w8                
557           add           a7, a7, w9                
558 CPU_BE(   rev           a4, a4          )         
559 CPU_BE(   rev           a5, a5          )         
560 CPU_BE(   rev           a6, a6          )         
561 CPU_BE(   rev           a7, a7          )         
562                                                   
563         // x8[0-3] += s2[0]                       
564         // x9[0-3] += s2[1]                       
565         // x10[0-3] += s2[2]                      
566         // x11[0-3] += s2[3]                      
567         add             v8.4s, v8.4s, v24.4s      
568           mov           w6, v24.s[0]              
569           mov           w7, v25.s[0]              
570         add             v9.4s, v9.4s, v25.4s      
571           mov           w8, v26.s[0]              
572           mov           w9, v27.s[0]              
573         add             v10.4s, v10.4s, v26.4s    
574           add           a8, a8, w6                
575           add           a9, a9, w7                
576         add             v11.4s, v11.4s, v27.4s    
577           add           a10, a10, w8              
578           add           a11, a11, w9              
579 CPU_BE(   rev           a8, a8          )         
580 CPU_BE(   rev           a9, a9          )         
581 CPU_BE(   rev           a10, a10        )         
582 CPU_BE(   rev           a11, a11        )         
583                                                   
584         // x12[0-3] += s3[0]                      
585         // x13[0-3] += s3[1]                      
586         // x14[0-3] += s3[2]                      
587         // x15[0-3] += s3[3]                      
588         add             v12.4s, v12.4s, v28.4s    
589           mov           w6, v28.s[0]              
590           mov           w7, v29.s[0]              
591         add             v13.4s, v13.4s, v29.4s    
592           mov           w8, v30.s[0]              
593           mov           w9, v31.s[0]              
594         add             v14.4s, v14.4s, v30.4s    
595           add           a12, a12, w6              
596           add           a13, a13, w7              
597         add             v15.4s, v15.4s, v31.4s    
598           add           a14, a14, w8              
599           add           a15, a15, w9              
600 CPU_BE(   rev           a12, a12        )         
601 CPU_BE(   rev           a13, a13        )         
602 CPU_BE(   rev           a14, a14        )         
603 CPU_BE(   rev           a15, a15        )         
604                                                   
605         // interleave 32-bit words in state n,    
606           ldp           w6, w7, [x2], #64         
607         zip1            v16.4s, v0.4s, v1.4s      
608           ldp           w8, w9, [x2, #-56]        
609           eor           a0, a0, w6                
610         zip2            v17.4s, v0.4s, v1.4s      
611           eor           a1, a1, w7                
612         zip1            v18.4s, v2.4s, v3.4s      
613           eor           a2, a2, w8                
614         zip2            v19.4s, v2.4s, v3.4s      
615           eor           a3, a3, w9                
616           ldp           w6, w7, [x2, #-48]        
617         zip1            v20.4s, v4.4s, v5.4s      
618           ldp           w8, w9, [x2, #-40]        
619           eor           a4, a4, w6                
620         zip2            v21.4s, v4.4s, v5.4s      
621           eor           a5, a5, w7                
622         zip1            v22.4s, v6.4s, v7.4s      
623           eor           a6, a6, w8                
624         zip2            v23.4s, v6.4s, v7.4s      
625           eor           a7, a7, w9                
626           ldp           w6, w7, [x2, #-32]        
627         zip1            v24.4s, v8.4s, v9.4s      
628           ldp           w8, w9, [x2, #-24]        
629           eor           a8, a8, w6                
630         zip2            v25.4s, v8.4s, v9.4s      
631           eor           a9, a9, w7                
632         zip1            v26.4s, v10.4s, v11.4s    
633           eor           a10, a10, w8              
634         zip2            v27.4s, v10.4s, v11.4s    
635           eor           a11, a11, w9              
636           ldp           w6, w7, [x2, #-16]        
637         zip1            v28.4s, v12.4s, v13.4s    
638           ldp           w8, w9, [x2, #-8]         
639           eor           a12, a12, w6              
640         zip2            v29.4s, v12.4s, v13.4s    
641           eor           a13, a13, w7              
642         zip1            v30.4s, v14.4s, v15.4s    
643           eor           a14, a14, w8              
644         zip2            v31.4s, v14.4s, v15.4s    
645           eor           a15, a15, w9              
646                                                   
647         add             x3, x2, x4                
648         sub             x3, x3, #128              
649                                                   
650         subs            x5, x4, #128              
651         csel            x2, x2, x3, ge            
652                                                   
653         // interleave 64-bit words in state n,    
654         zip1            v0.2d, v16.2d, v18.2d     
655         zip2            v4.2d, v16.2d, v18.2d     
656           stp           a0, a1, [x1], #64         
657         zip1            v8.2d, v17.2d, v19.2d     
658         zip2            v12.2d, v17.2d, v19.2d    
659           stp           a2, a3, [x1, #-56]        
660                                                   
661         subs            x6, x4, #192              
662         ld1             {v16.16b-v19.16b}, [x2    
663         csel            x2, x2, x3, ge            
664                                                   
665         zip1            v1.2d, v20.2d, v22.2d     
666         zip2            v5.2d, v20.2d, v22.2d     
667           stp           a4, a5, [x1, #-48]        
668         zip1            v9.2d, v21.2d, v23.2d     
669         zip2            v13.2d, v21.2d, v23.2d    
670           stp           a6, a7, [x1, #-40]        
671                                                   
672         subs            x7, x4, #256              
673         ld1             {v20.16b-v23.16b}, [x2    
674         csel            x2, x2, x3, ge            
675                                                   
676         zip1            v2.2d, v24.2d, v26.2d     
677         zip2            v6.2d, v24.2d, v26.2d     
678           stp           a8, a9, [x1, #-32]        
679         zip1            v10.2d, v25.2d, v27.2d    
680         zip2            v14.2d, v25.2d, v27.2d    
681           stp           a10, a11, [x1, #-24]      
682                                                   
683         subs            x8, x4, #320              
684         ld1             {v24.16b-v27.16b}, [x2    
685         csel            x2, x2, x3, ge            
686                                                   
687         zip1            v3.2d, v28.2d, v30.2d     
688         zip2            v7.2d, v28.2d, v30.2d     
689           stp           a12, a13, [x1, #-16]      
690         zip1            v11.2d, v29.2d, v31.2d    
691         zip2            v15.2d, v29.2d, v31.2d    
692           stp           a14, a15, [x1, #-8]       
693                                                   
694         tbnz            x5, #63, .Lt128           
695         ld1             {v28.16b-v31.16b}, [x2    
696                                                   
697         // xor with corresponding input, write    
698         eor             v16.16b, v16.16b, v0.1    
699         eor             v17.16b, v17.16b, v1.1    
700         eor             v18.16b, v18.16b, v2.1    
701         eor             v19.16b, v19.16b, v3.1    
702                                                   
703         tbnz            x6, #63, .Lt192           
704                                                   
705         eor             v20.16b, v20.16b, v4.1    
706         eor             v21.16b, v21.16b, v5.1    
707         eor             v22.16b, v22.16b, v6.1    
708         eor             v23.16b, v23.16b, v7.1    
709                                                   
710         st1             {v16.16b-v19.16b}, [x1    
711         tbnz            x7, #63, .Lt256           
712                                                   
713         eor             v24.16b, v24.16b, v8.1    
714         eor             v25.16b, v25.16b, v9.1    
715         eor             v26.16b, v26.16b, v10.    
716         eor             v27.16b, v27.16b, v11.    
717                                                   
718         st1             {v20.16b-v23.16b}, [x1    
719         tbnz            x8, #63, .Lt320           
720                                                   
721         eor             v28.16b, v28.16b, v12.    
722         eor             v29.16b, v29.16b, v13.    
723         eor             v30.16b, v30.16b, v14.    
724         eor             v31.16b, v31.16b, v15.    
725                                                   
726         st1             {v24.16b-v27.16b}, [x1    
727         st1             {v28.16b-v31.16b}, [x1    
728                                                   
729 .Lout:  frame_pop                                 
730         ret                                       
731                                                   
732         // fewer than 192 bytes of in/output      
733 .Lt192: cbz             x5, 1f                    
734         ld1             {v28.16b-v31.16b}, [x1    
735         add             x5, x5, x1                
736         tbl             v28.16b, {v4.16b-v7.16    
737         tbl             v29.16b, {v4.16b-v7.16    
738         tbl             v30.16b, {v4.16b-v7.16    
739         tbl             v31.16b, {v4.16b-v7.16    
740                                                   
741 0:      eor             v20.16b, v20.16b, v28.    
742         eor             v21.16b, v21.16b, v29.    
743         eor             v22.16b, v22.16b, v30.    
744         eor             v23.16b, v23.16b, v31.    
745         st1             {v20.16b-v23.16b}, [x5    
746 1:      st1             {v16.16b-v19.16b}, [x1    
747         b               .Lout                     
748                                                   
749         // fewer than 128 bytes of in/output      
750 .Lt128: ld1             {v28.16b-v31.16b}, [x1    
751         add             x5, x5, x1                
752         sub             x1, x1, #64               
753         tbl             v28.16b, {v0.16b-v3.16    
754         tbl             v29.16b, {v0.16b-v3.16    
755         tbl             v30.16b, {v0.16b-v3.16    
756         tbl             v31.16b, {v0.16b-v3.16    
757         ld1             {v16.16b-v19.16b}, [x1    
758         b               0b                        
759                                                   
760         // fewer than 256 bytes of in/output      
761 .Lt256: cbz             x6, 2f                    
762         ld1             {v4.16b-v7.16b}, [x10]    
763         add             x6, x6, x1                
764         tbl             v0.16b, {v8.16b-v11.16    
765         tbl             v1.16b, {v8.16b-v11.16    
766         tbl             v2.16b, {v8.16b-v11.16    
767         tbl             v3.16b, {v8.16b-v11.16    
768                                                   
769         eor             v28.16b, v28.16b, v0.1    
770         eor             v29.16b, v29.16b, v1.1    
771         eor             v30.16b, v30.16b, v2.1    
772         eor             v31.16b, v31.16b, v3.1    
773         st1             {v28.16b-v31.16b}, [x6    
774 2:      st1             {v20.16b-v23.16b}, [x1    
775         b               .Lout                     
776                                                   
777         // fewer than 320 bytes of in/output      
778 .Lt320: cbz             x7, 3f                    
779         ld1             {v4.16b-v7.16b}, [x10]    
780         add             x7, x7, x1                
781         tbl             v0.16b, {v12.16b-v15.1    
782         tbl             v1.16b, {v12.16b-v15.1    
783         tbl             v2.16b, {v12.16b-v15.1    
784         tbl             v3.16b, {v12.16b-v15.1    
785                                                   
786         eor             v28.16b, v28.16b, v0.1    
787         eor             v29.16b, v29.16b, v1.1    
788         eor             v30.16b, v30.16b, v2.1    
789         eor             v31.16b, v31.16b, v3.1    
790         st1             {v28.16b-v31.16b}, [x7    
791 3:      st1             {v24.16b-v27.16b}, [x1    
792         b               .Lout                     
793 SYM_FUNC_END(chacha_4block_xor_neon)              
794                                                   
795         .section        ".rodata", "a", %progb    
            // Read-only constant tables follow.
            // NOTE(review): the section type above reads `%progb`, which looks
            // cut off at the display width (presumably `%progbits`) - confirm
            // against the upstream file before assembling.
            // Align the tables using L1_CACHE_SHIFT (defined outside this chunk);
            // on ARM targets GAS treats the .align operand as a power-of-two
            // exponent.
796         .align          L1_CACHE_SHIFT            
            // .Lpermute: 128 consecutive bytes generated by the .rept loop below.
            // .Li runs from 0 to 127 and each byte is (.Li - 64), so the table
            // holds -64, -63, ..., 62, 63 (0xc0..0xff then 0x00..0x3f as
            // unsigned bytes).
            // NOTE(review): presumably the source of the tbl index vectors used
            // by the short-input tail paths - the loads are truncated in this
            // view, verify at the use site.
797 .Lpermute:                                        
798         .set            .Li, 0                    
799         .rept           128                       
800         .byte           (.Li - 64)                
801         .set            .Li, .Li + 1              
802         .endr                                     
803                                                   
            // CTRINC: the four words 1, 2, 3, 4.
            // NOTE(review): the name suggests per-block counter increments; the
            // use site is not visible in this chunk - confirm.
804 CTRINC: .word           1, 2, 3, 4                
            // ROT8: only two words are visible here; the line ends exactly at the
            // display width, so further words may have been cut off.
            // NOTE(review): the name suggests a byte-permute mask for a rotate
            // by 8 bits - confirm against upstream and the use site.
805 ROT8:   .word           0x02010003, 0x06050407    
                                                      

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php