Linux/arch/x86/crypto/camellia-aesni-avx-asm_64.S

/*
 * x86_64/AVX/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/*
 * Version licensed under 2-clause BSD License is available at:
 *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
 */

#include <linux/linkage.h>
#include <asm/frame.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

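/*
 * For reference, the key_table/key_length offsets above mirror the C-side
 * context layout; a sketch, assuming the definition used by the kernel's
 * camellia glue code:
 *
 *	struct camellia_ctx {
 *		u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
 *		u32 key_length;
 *	};
 *
 * so key_table lives at offset 0 and key_length at offset 272.
 */
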
/* register macros */
#define CTX %rdi

/**********************************************************************
  16-way camellia
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

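/*
 * What filter_8bit computes per byte, as a C sketch (illustrative only,
 * not part of the build): an 8-bit table lookup split into two 16-entry
 * nibble tables so it fits vpshufb. mask4bit must hold 0x0f in every
 * byte; tmp0 is scratch.
 *
 *	#include <stdint.h>
 *
 *	static uint8_t filter_8bit_ref(uint8_t x, const uint8_t lo_t[16],
 *				       const uint8_t hi_t[16])
 *	{
 *		// look up low and high nibble separately, combine with XOR
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *	}
 */
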
/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vmovdqa .Linv_shift_row(%rip), t4; \
	vbroadcastss .L0f0f0f0f(%rip), t7; \
	vmovdqa .Lpre_tf_lo_s1(%rip), t0; \
	vmovdqa .Lpre_tf_hi_s1(%rip), t1; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4(%rip), t2; \
	vmovdqa .Lpre_tf_hi_s4(%rip), t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	\
	/* prefilter sbox 4 */ \
	vpxor t4, t4, t4; \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	\
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1(%rip), t0; \
	vmovdqa .Lpost_tf_hi_s1(%rip), t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3(%rip), t2; \
	vmovdqa .Lpost_tf_hi_s3(%rip), t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2(%rip), t4; \
	vmovdqa .Lpost_tf_hi_s2(%rip), t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpxor t6, t6, t6; \
	vmovq key, t0; \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	\
	/* \
	 * P-function \
	 */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* \
	 * Add key material and result to CD (x becomes new CD) \
	 */ \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 16(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	\
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 16(mem_cd), x3, x3;

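/*
 * For reference, the P-function vpxor chain in roundsm16 above is a
 * byte-sliced form of Camellia's P-function. In the notation of
 * RFC 3713, with input bytes z1..z8 and output bytes zz1..zz8:
 *
 *	zz1 = z1 ^ z3 ^ z4 ^ z6 ^ z7 ^ z8;
 *	zz2 = z1 ^ z2 ^ z4 ^ z5 ^ z7 ^ z8;
 *	zz3 = z1 ^ z2 ^ z3 ^ z5 ^ z6 ^ z8;
 *	zz4 = z2 ^ z3 ^ z4 ^ z5 ^ z6 ^ z7;
 *	zz5 = z1 ^ z2 ^ z6 ^ z7 ^ z8;
 *	zz6 = z2 ^ z3 ^ z5 ^ z7 ^ z8;
 *	zz7 = z3 ^ z4 ^ z5 ^ z6 ^ z8;
 *	zz8 = z1 ^ z4 ^ z5 ^ z6 ^ z7;
 *
 * Here each z is a whole xmm register holding the same byte position of
 * 16 independent blocks, so one vpxor advances all 16 blocks at once.
 */
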
/*
 * Size optimization... with inlined roundsm16 binary would be over 5 times
 * larger and would only be 0.5% faster (on sandy-bridge).
 */
.align 8
SYM_FUNC_START_LOCAL(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		  %rcx, (%r9));
	RET;
SYM_FUNC_END(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
SYM_FUNC_START_LOCAL(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
		  %rax, (%r9));
	RET;
SYM_FUNC_END(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);

#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;

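/*
 * Scalar sketch of what rol32_1_16 does for one of the 16 lanes
 * (illustrative only): each 32-bit word lives as four separate byte
 * slices, and since SSE/AVX has no per-byte shift, vpaddb v, v doubles
 * each byte while vpcmpgtb/vpabsb extract the carry bits.
 *
 *	#include <stdint.h>
 *
 *	static void rol32_1_sliced(uint8_t v[4])
 *	{
 *		// v[3] is the most significant byte slice, v[0] the least
 *		uint8_t carry[4];
 *		for (int i = 0; i < 4; i++)
 *			carry[i] = v[i] >> 7;		// vpcmpgtb + vpabsb
 *		for (int i = 0; i < 4; i++)
 *			v[i] = (uint8_t)(v[i] << 1);	// vpaddb v, v
 *		v[1] |= carry[0];	// each MSB feeds the next slice's LSB
 *		v[2] |= carry[1];
 *		v[3] |= carry[2];
 *		v[0] |= carry[3];	// wrap-around makes it a rotate
 *	}
 */
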
/*
 * IN:
 *   r: byte-sliced AB state in memory
 *   l: byte-sliced CD state in memory
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpxor tt0, tt0, tt0; \
	vmovd kll, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vmovdqu l4, 4 * 16(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 16(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 16(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 16(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vmovd krr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	\
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vmovd krl, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vmovd klr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 16(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 16(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 16(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 16(l);

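/*
 * fls16 applies Camellia's FL and FL⁻¹ functions to all 16 blocks; the
 * vmovd/vpsrldq/vpshufb ladders above broadcast each 32-bit subkey
 * byte-by-byte across the slices. For reference, the scalar operations
 * (per RFC 3713), with each half split into 32-bit left/right parts:
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rol32(uint32_t v, int n)
 *	{
 *		return (v << n) | (v >> (32 - n));
 *	}
 *
 *	// FL: x = (xl || xr), subkey k = (kl || kr)
 *	static void camellia_fl(uint32_t *xl, uint32_t *xr,
 *				uint32_t kl, uint32_t kr)
 *	{
 *		*xr ^= rol32(*xl & kl, 1);
 *		*xl ^= (*xr | kr);
 *	}
 *
 *	// FL⁻¹ applies the same steps in reverse order
 *	static void camellia_flinv(uint32_t *yl, uint32_t *yr,
 *				   uint32_t kl, uint32_t kr)
 *	{
 *		*yl ^= (*yr | kr);
 *		*yr ^= rol32(*yl & kl, 1);
 *	}
 */
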
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

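/*
 * transpose_4x4 treats its four xmm arguments as rows of a 4x4 matrix
 * of 32-bit words: vpunpck{l,h}dq interleaves 32-bit words and
 * vpunpck{l,h}qdq then interleaves 64-bit halves, landing every word in
 * its transposed position. Scalar equivalent (a sketch, illustrative
 * only):
 *
 *	#include <stdint.h>
 *
 *	static void transpose_4x4_ref(uint32_t m[4][4])
 *	{
 *		uint32_t t[4][4];
 *		for (int i = 0; i < 4; i++)
 *			for (int j = 0; j < 4; j++)
 *				t[j][i] = m[i][j];	// swap row/column
 *		for (int i = 0; i < 4; i++)
 *			for (int j = 0; j < 4; j++)
 *				m[i][j] = t[i][j];
 *	}
 */
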
#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

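/*
 * Net effect of byteslice_16x16b as a C sketch (illustrative only): the
 * 16 registers form a 16x16 byte matrix with one block per register,
 * and the macro transposes it so each output register holds one byte
 * position of all 16 blocks (with the in-vector byte order caveat noted
 * above).
 *
 *	#include <stdint.h>
 *
 *	static void byteslice_ref(uint8_t out[16][16],
 *				  const uint8_t in[16][16])
 *	{
 *		// in[b][i] = byte i of block b; out[i][b] = same byte, sliced
 *		for (int b = 0; b < 16; b++)
 *			for (int i = 0; i < 16; i++)
 *				out[i][b] = in[b][i];
 *	}
 */
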
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vmovq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;

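/*
 * inpack16_pre in C terms (a sketch): broadcast the 64-bit pre-whitening
 * key into a 128-bit pattern (vmovq + vpshufb .Lpack_bswap) and XOR it
 * into each block while loading. Note the reversed register order:
 * block 0 ends up in y7 and block 15 in x0. kw[] below stands for the
 * broadcast pattern, reg[] for the registers in x0..x7, y0..y7 order.
 *
 *	static void inpack16_pre_ref(uint8_t reg[16][16],
 *				     const uint8_t src[16][16],
 *				     const uint8_t kw[16])
 *	{
 *		for (int b = 0; b < 16; b++)
 *			for (int i = 0; i < 16; i++)
 *				reg[15 - b][i] = src[b][i] ^ kw[i];
 *	}
 */
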
/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vmovq key, x0; \
	vpshufb .Lpack_bswap(%rip), x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);


/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.Lpack_bswap:
	.long 0x00010203
	.long 0x04050607
	.long 0x80808080
	.long 0x80808080

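/*
 * Background for the pre/post-filter table pairs below: Camellia's
 * sboxes 2-4 are simple rotations of sbox 1,
 *
 *	s2(x) = rol8(s1(x), 1)		// output rotated left by 1
 *	s3(x) = ror8(s1(x), 1)		// output rotated right by 1
 *	s4(x) = s1(rol8(x, 1))		// input rotated left by 1
 *
 * so sboxes 1-3 can share one pre-filter while sbox 4 needs its own
 * (the input rotate), and sboxes 1 and 4 share one post-filter while
 * sboxes 2 and 3 each need their own (the output rotates). A C sketch
 * of the rotate helper:
 *
 *	static uint8_t rol8(uint8_t v, int n)
 *	{
 *		return (uint8_t)((v << n) | (v >> (8 - n)));
 *	}
 */
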
/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x16

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

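/*
 * The table above is AES InvShiftRows expressed as a vpshufb byte
 * permutation: entry i is (13 * i) mod 16. A C sketch that regenerates
 * it (illustrative only):
 *
 *	static void gen_inv_shift_row(uint8_t tbl[16])
 *	{
 *		for (int i = 0; i < 16; i++)
 *			tbl[i] = (uint8_t)((13 * i) % 16);
 *	}
 *
 * Applying it before AESENCLAST cancels that instruction's ShiftRows
 * step, leaving only SubBytes (plus the XOR with the zeroed round key).
 */
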
/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__camellia_enc_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%xmm0..%xmm15: 16 plaintext blocks
	 * output:
	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

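/*
 * Key-length dispatch, in C terms (a sketch of the intent): 128-bit
 * keys run the 18-round schedule and post-whiten with the 64-bit
 * subkey at index 24, while 192/256-bit keys jump to .Lenc_max32 for
 * six more rounds and index 32:
 *
 *	int post_whitening_idx = (key_length == 16) ? 24 : 32;
 */
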
.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	FRAME_END
	RET;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk16)

SYM_FUNC_START_LOCAL(__camellia_dec_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%xmm0..%xmm15: 16 encrypted blocks
	 * output:
	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	FRAME_END
	RET;

.align 8
.Ldec_max32:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk16)

SYM_FUNC_START(camellia_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq	%rsi, %rax;

	call __camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_enc_16way)

SYM_FUNC_START(camellia_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq	%rsi, %rax;

	call __camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_ecb_dec_16way)

SYM_FUNC_START(camellia_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * dst might still be in-use (in case dst == src), so use stack for
	 * temporary storage.
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

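/*
 * CBC decryption chaining, in C terms (illustrative sketch): each
 * plaintext is the decrypted block XORed with the previous ciphertext,
 * and the vpxor chain below does this for blocks 1..15 straight from
 * src (still intact, since the stack was used as scratch). Block 0 is
 * XORed with the IV by the caller, not here.
 *
 *	for (int b = 15; b >= 1; b--)
 *		dst[b] = decrypted[b] ^ src[b - 1];
 */
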
	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	RET;
SYM_FUNC_END(camellia_cbc_dec_16way)
                                                      
