
TOMOYO Linux Cross Reference
Linux/arch/x86/crypto/aria-aesni-avx2-asm_64.S

Version: linux-6.12-rc7



  1 /* SPDX-License-Identifier: GPL-2.0-or-later */
  2 /*                                                
  3  * ARIA Cipher 32-way parallel algorithm (AVX2)
  4  *                                                
  5  * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
  6  *                                                
  7  */                                               
  8                                                   
  9 #include <linux/linkage.h>                        
 10 #include <asm/frame.h>                            
 11 #include <asm/asm-offsets.h>                      
 12 #include <linux/cfi_types.h>                      
 13                                                   
 14 /* register macros */                             
 15 #define CTX %rdi                                  
 16                                                   
 17 #define ymm0_x xmm0                               
 18 #define ymm1_x xmm1                               
 19 #define ymm2_x xmm2                               
 20 #define ymm3_x xmm3                               
 21 #define ymm4_x xmm4                               
 22 #define ymm5_x xmm5                               
 23 #define ymm6_x xmm6                               
 24 #define ymm7_x xmm7                               
 25 #define ymm8_x xmm8                               
 26 #define ymm9_x xmm9                               
 27 #define ymm10_x xmm10                             
 28 #define ymm11_x xmm11                             
 29 #define ymm12_x xmm12                             
 30 #define ymm13_x xmm13                             
 31 #define ymm14_x xmm14                             
 32 #define ymm15_x xmm15                             
 33                                                   
 34 #define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
 35         ( (((a0) & 1) << 0) |				\
 36           (((a1) & 1) << 1) |				\
 37           (((a2) & 1) << 2) |				\
 38           (((a3) & 1) << 3) |				\
 39           (((a4) & 1) << 4) |				\
 40           (((a5) & 1) << 5) |				\
 41           (((a6) & 1) << 6) |				\
 42           (((a7) & 1) << 7) )
 43                                                   
 44 #define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
 45         ( ((l7) << (0 * 8)) |				\
 46           ((l6) << (1 * 8)) |				\
 47           ((l5) << (2 * 8)) |				\
 48           ((l4) << (3 * 8)) |				\
 49           ((l3) << (4 * 8)) |				\
 50           ((l2) << (5 * 8)) |				\
 51           ((l1) << (6 * 8)) |				\
 52           ((l0) << (7 * 8)) )
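     /*
      * Example: BV8 packs eight bits LSB-first into one byte, so
      * BV8(1, 1, 0, 0, 0, 1, 1, 0) = 0x63 (the AES affine constant).
      * BM8X8 packs eight such rows into a 64-bit matrix with row l0 in the
      * most significant byte, the layout consumed by vgf2p8affineqb /
      * vgf2p8affineinvqb (the top byte supplies the row for output bit 0).
      */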
 53                                                   
 54 #define inc_le128(x, minus_one, tmp)			\
 55         vpcmpeqq minus_one, x, tmp;			\
 56         vpsubq minus_one, x, x;				\
 57         vpslldq $8, tmp, tmp;				\
 58         vpsubq tmp, x, x;
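     /*
      * inc_le128 increments a 128-bit little-endian counter held in a
      * vector lane: subtracting minus_one (high qword 0, low qword -1)
      * adds 1 to the low qword, and the vpcmpeqq/vpslldq/vpsubq sequence
      * adds a carry into the high qword only when the low qword was
      * all-ones before the increment.
      */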
 59                                                   
 60 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
 61         vpand x, mask4bit, tmp0;			\
 62         vpandn x, mask4bit, x;				\
 63         vpsrld $4, x, x;				\
 64 							\
 65         vpshufb tmp0, lo_t, tmp0;			\
 66         vpshufb x, hi_t, x;				\
 67         vpxor tmp0, x, x;
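     /*
      * filter_8bit applies an 8-bit affine transform as two 4-bit vpshufb
      * table lookups: lo_t is indexed by the low nibble of each byte, hi_t
      * by the high nibble, and the two partial results are XORed together.
      * mask4bit must hold 0x0f in every byte (see .L0f0f0f0f below).
      */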
 68                                                   
 69 #define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
 70         vpunpckhdq x1, x0, t2;				\
 71         vpunpckldq x1, x0, x0;				\
 72 							\
 73         vpunpckldq x3, x2, t1;				\
 74         vpunpckhdq x3, x2, x2;				\
 75 							\
 76         vpunpckhqdq t1, x0, x1;				\
 77         vpunpcklqdq t1, x0, x0;				\
 78 							\
 79         vpunpckhqdq x2, t2, x3;				\
 80         vpunpcklqdq x2, t2, x2;
 81                                                   
 82 #define byteslice_16x16b(a0, b0, c0, d0,          
 83                          a1, b1, c1, d1,          
 84                          a2, b2, c2, d2,          
 85                          a3, b3, c3, d3,          
 86                          st0, st1)                
 87         vmovdqu d2, st0;                          
 88         vmovdqu d3, st1;                          
 89         transpose_4x4(a0, a1, a2, a3, d2, d3);    
 90         transpose_4x4(b0, b1, b2, b3, d2, d3);    
 91         vmovdqu st0, d2;                          
 92         vmovdqu st1, d3;                          
 93                                                   
 94         vmovdqu a0, st0;                          
 95         vmovdqu a1, st1;                          
 96         transpose_4x4(c0, c1, c2, c3, a0, a1);    
 97         transpose_4x4(d0, d1, d2, d3, a0, a1);    
 98                                                   
 99         vbroadcasti128 .Lshufb_16x16b(%rip), a    
100         vmovdqu st1, a1;                          
101         vpshufb a0, a2, a2;                       
102         vpshufb a0, a3, a3;                       
103         vpshufb a0, b0, b0;                       
104         vpshufb a0, b1, b1;                       
105         vpshufb a0, b2, b2;                       
106         vpshufb a0, b3, b3;                       
107         vpshufb a0, a1, a1;                       
108         vpshufb a0, c0, c0;                       
109         vpshufb a0, c1, c1;                       
110         vpshufb a0, c2, c2;                       
111         vpshufb a0, c3, c3;                       
112         vpshufb a0, d0, d0;                       
113         vpshufb a0, d1, d1;                       
114         vpshufb a0, d2, d2;                       
115         vpshufb a0, d3, d3;                       
116         vmovdqu d3, st1;                          
117         vmovdqu st0, d3;                          
118         vpshufb a0, d3, a0;                       
119         vmovdqu d2, st0;                          
120                                                   
121         transpose_4x4(a0, b0, c0, d0, d2, d3);    
122         transpose_4x4(a1, b1, c1, d1, d2, d3);    
123         vmovdqu st0, d2;                          
124         vmovdqu st1, d3;                          
125                                                   
126         vmovdqu b0, st0;                          
127         vmovdqu b1, st1;                          
128         transpose_4x4(a2, b2, c2, d2, b0, b1);    
129         transpose_4x4(a3, b3, c3, d3, b0, b1);    
130         vmovdqu st0, b0;                          
131         vmovdqu st1, b1;                          
132         /* does not adjust output bytes inside    
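     /*
      * byteslice_16x16b transposes the 16 input registers so that each ymm
      * register afterwards holds the same byte position of all 32 blocks
      * (byte-sliced layout); debyteslice_16x16b below reverses it.
      */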
133                                                   
134 #define debyteslice_16x16b(a0, b0, c0, d0,        
135                            a1, b1, c1, d1,        
136                            a2, b2, c2, d2,        
137                            a3, b3, c3, d3,        
138                            st0, st1)              
139         vmovdqu d2, st0;                          
140         vmovdqu d3, st1;                          
141         transpose_4x4(a0, a1, a2, a3, d2, d3);    
142         transpose_4x4(b0, b1, b2, b3, d2, d3);    
143         vmovdqu st0, d2;                          
144         vmovdqu st1, d3;                          
145                                                   
146         vmovdqu a0, st0;                          
147         vmovdqu a1, st1;                          
148         transpose_4x4(c0, c1, c2, c3, a0, a1);    
149         transpose_4x4(d0, d1, d2, d3, a0, a1);    
150                                                   
151         vbroadcasti128 .Lshufb_16x16b(%rip), a    
152         vmovdqu st1, a1;                          
153         vpshufb a0, a2, a2;                       
154         vpshufb a0, a3, a3;                       
155         vpshufb a0, b0, b0;                       
156         vpshufb a0, b1, b1;                       
157         vpshufb a0, b2, b2;                       
158         vpshufb a0, b3, b3;                       
159         vpshufb a0, a1, a1;                       
160         vpshufb a0, c0, c0;                       
161         vpshufb a0, c1, c1;                       
162         vpshufb a0, c2, c2;                       
163         vpshufb a0, c3, c3;                       
164         vpshufb a0, d0, d0;                       
165         vpshufb a0, d1, d1;                       
166         vpshufb a0, d2, d2;                       
167         vpshufb a0, d3, d3;                       
168         vmovdqu d3, st1;                          
169         vmovdqu st0, d3;                          
170         vpshufb a0, d3, a0;                       
171         vmovdqu d2, st0;                          
172                                                   
173         transpose_4x4(c0, d0, a0, b0, d2, d3);    
174         transpose_4x4(c1, d1, a1, b1, d2, d3);    
175         vmovdqu st0, d2;                          
176         vmovdqu st1, d3;                          
177                                                   
178         vmovdqu b0, st0;                          
179         vmovdqu b1, st1;                          
180         transpose_4x4(c2, d2, a2, b2, b0, b1);    
181         transpose_4x4(c3, d3, a3, b3, b0, b1);    
182         vmovdqu st0, b0;                          
183         vmovdqu st1, b1;                          
184         /* does not adjust output bytes inside    
185                                                   
186 /* load blocks to registers and apply pre-whit    
187 #define inpack16_pre(x0, x1, x2, x3,              
188                      x4, x5, x6, x7,              
189                      y0, y1, y2, y3,              
190                      y4, y5, y6, y7,              
191                      rio)                         
192         vmovdqu (0 * 32)(rio), x0;                
193         vmovdqu (1 * 32)(rio), x1;                
194         vmovdqu (2 * 32)(rio), x2;                
195         vmovdqu (3 * 32)(rio), x3;                
196         vmovdqu (4 * 32)(rio), x4;                
197         vmovdqu (5 * 32)(rio), x5;                
198         vmovdqu (6 * 32)(rio), x6;                
199         vmovdqu (7 * 32)(rio), x7;                
200         vmovdqu (8 * 32)(rio), y0;                
201         vmovdqu (9 * 32)(rio), y1;                
202         vmovdqu (10 * 32)(rio), y2;               
203         vmovdqu (11 * 32)(rio), y3;               
204         vmovdqu (12 * 32)(rio), y4;               
205         vmovdqu (13 * 32)(rio), y5;               
206         vmovdqu (14 * 32)(rio), y6;               
207         vmovdqu (15 * 32)(rio), y7;               
208                                                   
209 /* byteslice pre-whitened blocks and store to     
210 #define inpack16_post(x0, x1, x2, x3,             
211                       x4, x5, x6, x7,             
212                       y0, y1, y2, y3,             
213                       y4, y5, y6, y7,             
214                       mem_ab, mem_cd)             
215         byteslice_16x16b(x0, x1, x2, x3,          
216                          x4, x5, x6, x7,          
217                          y0, y1, y2, y3,          
218                          y4, y5, y6, y7,          
219                          (mem_ab), (mem_cd));     
220                                                   
221         vmovdqu x0, 0 * 32(mem_ab);               
222         vmovdqu x1, 1 * 32(mem_ab);               
223         vmovdqu x2, 2 * 32(mem_ab);               
224         vmovdqu x3, 3 * 32(mem_ab);               
225         vmovdqu x4, 4 * 32(mem_ab);               
226         vmovdqu x5, 5 * 32(mem_ab);               
227         vmovdqu x6, 6 * 32(mem_ab);               
228         vmovdqu x7, 7 * 32(mem_ab);               
229         vmovdqu y0, 0 * 32(mem_cd);               
230         vmovdqu y1, 1 * 32(mem_cd);               
231         vmovdqu y2, 2 * 32(mem_cd);               
232         vmovdqu y3, 3 * 32(mem_cd);               
233         vmovdqu y4, 4 * 32(mem_cd);               
234         vmovdqu y5, 5 * 32(mem_cd);               
235         vmovdqu y6, 6 * 32(mem_cd);               
236         vmovdqu y7, 7 * 32(mem_cd);               
237                                                   
238 #define write_output(x0, x1, x2, x3,              
239                      x4, x5, x6, x7,              
240                      y0, y1, y2, y3,              
241                      y4, y5, y6, y7,              
242                      mem)                         
243         vmovdqu x0, 0 * 32(mem);                  
244         vmovdqu x1, 1 * 32(mem);                  
245         vmovdqu x2, 2 * 32(mem);                  
246         vmovdqu x3, 3 * 32(mem);                  
247         vmovdqu x4, 4 * 32(mem);                  
248         vmovdqu x5, 5 * 32(mem);                  
249         vmovdqu x6, 6 * 32(mem);                  
250         vmovdqu x7, 7 * 32(mem);                  
251         vmovdqu y0, 8 * 32(mem);                  
252         vmovdqu y1, 9 * 32(mem);                  
253         vmovdqu y2, 10 * 32(mem);                 
254         vmovdqu y3, 11 * 32(mem);                 
255         vmovdqu y4, 12 * 32(mem);                 
256         vmovdqu y5, 13 * 32(mem);                 
257         vmovdqu y6, 14 * 32(mem);                 
258         vmovdqu y7, 15 * 32(mem);                 
259                                                   
260 #define aria_store_state_8way(x0, x1, x2, x3,     
261                               x4, x5, x6, x7,     
262                               mem_tmp, idx)       
263         vmovdqu x0, ((idx + 0) * 32)(mem_tmp);    
264         vmovdqu x1, ((idx + 1) * 32)(mem_tmp);    
265         vmovdqu x2, ((idx + 2) * 32)(mem_tmp);    
266         vmovdqu x3, ((idx + 3) * 32)(mem_tmp);    
267         vmovdqu x4, ((idx + 4) * 32)(mem_tmp);    
268         vmovdqu x5, ((idx + 5) * 32)(mem_tmp);    
269         vmovdqu x6, ((idx + 6) * 32)(mem_tmp);    
270         vmovdqu x7, ((idx + 7) * 32)(mem_tmp);    
271                                                   
272 #define aria_load_state_8way(x0, x1, x2, x3,      
273                              x4, x5, x6, x7,      
274                              mem_tmp, idx)        
275         vmovdqu ((idx + 0) * 32)(mem_tmp), x0;    
276         vmovdqu ((idx + 1) * 32)(mem_tmp), x1;    
277         vmovdqu ((idx + 2) * 32)(mem_tmp), x2;    
278         vmovdqu ((idx + 3) * 32)(mem_tmp), x3;    
279         vmovdqu ((idx + 4) * 32)(mem_tmp), x4;    
280         vmovdqu ((idx + 5) * 32)(mem_tmp), x5;    
281         vmovdqu ((idx + 6) * 32)(mem_tmp), x6;    
282         vmovdqu ((idx + 7) * 32)(mem_tmp), x7;    
283                                                   
284 #define aria_ark_8way(x0, x1, x2, x3,			\
285                       x4, x5, x6, x7,			\
286                       t0, rk, idx, round)		\
287         /* AddRoundKey */				\
288         vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
289         vpxor t0, x0, x0;				\
290         vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
291         vpxor t0, x1, x1;				\
292         vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
293         vpxor t0, x2, x2;				\
294         vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
295         vpxor t0, x3, x3;				\
296         vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
297         vpxor t0, x4, x4;				\
298         vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
299         vpxor t0, x5, x5;				\
300         vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
301         vpxor t0, x6, x6;				\
302         vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
303         vpxor t0, x7, x7;
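     /*
      * In the byte-sliced layout each ymm register carries one byte
      * position of all 32 blocks, so AddRoundKey reduces to broadcasting a
      * single round-key byte from (rk) and XORing it into the matching
      * state register.
      */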
304                                                   
305 #ifdef CONFIG_AS_GFNI                             
306 #define aria_sbox_8way_gfni(x0, x1, x2, x3,       
307                             x4, x5, x6, x7,       
308                             t0, t1, t2, t3,       
309                             t4, t5, t6, t7)       
310         vpbroadcastq .Ltf_s2_bitmatrix(%rip),     
311         vpbroadcastq .Ltf_inv_bitmatrix(%rip),    
312         vpbroadcastq .Ltf_id_bitmatrix(%rip),     
313         vpbroadcastq .Ltf_aff_bitmatrix(%rip),    
314         vpbroadcastq .Ltf_x2_bitmatrix(%rip),     
315         vgf2p8affineinvqb $(tf_s2_const), t0,     
316         vgf2p8affineinvqb $(tf_s2_const), t0,     
317         vgf2p8affineqb $(tf_inv_const), t1, x2    
318         vgf2p8affineqb $(tf_inv_const), t1, x6    
319         vgf2p8affineinvqb $0, t2, x2, x2;         
320         vgf2p8affineinvqb $0, t2, x6, x6;         
321         vgf2p8affineinvqb $(tf_aff_const), t3,    
322         vgf2p8affineinvqb $(tf_aff_const), t3,    
323         vgf2p8affineqb $(tf_x2_const), t4, x3,    
324         vgf2p8affineqb $(tf_x2_const), t4, x7,    
325         vgf2p8affineinvqb $0, t2, x3, x3;         
326         vgf2p8affineinvqb $0, t2, x7, x7          
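     /*
      * With GFNI, each ARIA S-box takes one or two instructions:
      * vgf2p8affineinvqb with the aff/s2 bit matrices gives S1 and S2
      * (affine transform of the GF(2^8) inverse), while X1 and X2 are
      * built from vgf2p8affineqb followed by an identity-matrix
      * vgf2p8affineinvqb, which performs the field inversion last.
      */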
327                                                   
328 #endif /* CONFIG_AS_GFNI */                       
329 #define aria_sbox_8way(x0, x1, x2, x3,            
330                        x4, x5, x6, x7,            
331                        t0, t1, t2, t3,            
332                        t4, t5, t6, t7)            
333         vpxor t7, t7, t7;                         
334         vpxor t6, t6, t6;                         
335         vbroadcasti128 .Linv_shift_row(%rip),     
336         vbroadcasti128 .Lshift_row(%rip), t1;     
337         vbroadcasti128 .Ltf_lo__inv_aff__and__    
338         vbroadcasti128 .Ltf_hi__inv_aff__and__    
339         vbroadcasti128 .Ltf_lo__x2__and__fwd_a    
340         vbroadcasti128 .Ltf_hi__x2__and__fwd_a    
341                                                   
342         vextracti128 $1, x0, t6##_x;              
343         vaesenclast t7##_x, x0##_x, x0##_x;       
344         vaesenclast t7##_x, t6##_x, t6##_x;       
345         vinserti128 $1, t6##_x, x0, x0;           
346                                                   
347         vextracti128 $1, x4, t6##_x;              
348         vaesenclast t7##_x, x4##_x, x4##_x;       
349         vaesenclast t7##_x, t6##_x, t6##_x;       
350         vinserti128 $1, t6##_x, x4, x4;           
351                                                   
352         vextracti128 $1, x1, t6##_x;              
353         vaesenclast t7##_x, x1##_x, x1##_x;       
354         vaesenclast t7##_x, t6##_x, t6##_x;       
355         vinserti128 $1, t6##_x, x1, x1;           
356                                                   
357         vextracti128 $1, x5, t6##_x;              
358         vaesenclast t7##_x, x5##_x, x5##_x;       
359         vaesenclast t7##_x, t6##_x, t6##_x;       
360         vinserti128 $1, t6##_x, x5, x5;           
361                                                   
362         vextracti128 $1, x2, t6##_x;              
363         vaesdeclast t7##_x, x2##_x, x2##_x;       
364         vaesdeclast t7##_x, t6##_x, t6##_x;       
365         vinserti128 $1, t6##_x, x2, x2;           
366                                                   
367         vextracti128 $1, x6, t6##_x;              
368         vaesdeclast t7##_x, x6##_x, x6##_x;       
369         vaesdeclast t7##_x, t6##_x, t6##_x;       
370         vinserti128 $1, t6##_x, x6, x6;           
371                                                   
372         vpbroadcastd .L0f0f0f0f(%rip), t6;        
373                                                   
374         /* AES inverse shift rows */              
375         vpshufb t0, x0, x0;                       
376         vpshufb t0, x4, x4;                       
377         vpshufb t0, x1, x1;                       
378         vpshufb t0, x5, x5;                       
379         vpshufb t1, x3, x3;                       
380         vpshufb t1, x7, x7;                       
381         vpshufb t1, x2, x2;                       
382         vpshufb t1, x6, x6;                       
383                                                   
384         /* affine transformation for S2 */        
385         filter_8bit(x1, t2, t3, t6, t0);          
386         /* affine transformation for S2 */        
387         filter_8bit(x5, t2, t3, t6, t0);          
388                                                   
389         /* affine transformation for X2 */        
390         filter_8bit(x3, t4, t5, t6, t0);          
391         /* affine transformation for X2 */        
392         filter_8bit(x7, t4, t5, t6, t0);          
393                                                   
394         vpxor t6, t6, t6;                         
395         vextracti128 $1, x3, t6##_x;              
396         vaesdeclast t7##_x, x3##_x, x3##_x;       
397         vaesdeclast t7##_x, t6##_x, t6##_x;       
398         vinserti128 $1, t6##_x, x3, x3;           
399                                                   
400         vextracti128 $1, x7, t6##_x;              
401         vaesdeclast t7##_x, x7##_x, x7##_x;       
402         vaesdeclast t7##_x, t6##_x, t6##_x;       
403         vinserti128 $1, t6##_x, x7, x7;           
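     /*
      * Without GFNI the S-boxes are derived from AES-NI: aesenclast /
      * aesdeclast with an all-zero round key (t7) leave only
      * SubBytes/InvSubBytes plus the (inverse) ShiftRows, the vpshufb with
      * .Linv_shift_row / .Lshift_row undoes the row shuffle, and
      * filter_8bit applies the combined affine maps that turn the AES
      * boxes into ARIA's S2 and X2.  AES-NI operates on 128-bit lanes
      * only, so each ymm register is split with vextracti128 and
      * reassembled with vinserti128 around those steps.
      */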
404                                                   
405 #define aria_diff_m(x0, x1, x2, x3,			\
406                     t0, t1, t2, t3)			\
407         /* T = rotr32(X, 8); */				\
408         /* X ^= T */					\
409         vpxor x0, x3, t0;				\
410         vpxor x1, x0, t1;				\
411         vpxor x2, x1, t2;				\
412         vpxor x3, x2, t3;				\
413         /* X = T ^ rotr(X, 16); */			\
414         vpxor t2, x0, x0;				\
415         vpxor x1, t3, t3;				\
416         vpxor t0, x2, x2;				\
417         vpxor t1, x3, x1;				\
418         vmovdqu t3, x3;
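     /*
      * Net effect of aria_diff_m with x0..x3 holding the four byte
      * positions of a 32-bit word: every output byte is the XOR of the
      * other three input bytes, i.e. X = rotr32(X, 8) ^ rotr32(X, 16) ^
      * rotr32(X, 24).
      */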
419                                                   
420 #define aria_diff_word(x0, x1, x2, x3,            
421                        x4, x5, x6, x7,            
422                        y0, y1, y2, y3,            
423                        y4, y5, y6, y7)            
424         /* t1 ^= t2; */                           
425         vpxor y0, x4, x4;                         
426         vpxor y1, x5, x5;                         
427         vpxor y2, x6, x6;                         
428         vpxor y3, x7, x7;                         
429                                                   
430         /* t2 ^= t3; */                           
431         vpxor y4, y0, y0;                         
432         vpxor y5, y1, y1;                         
433         vpxor y6, y2, y2;                         
434         vpxor y7, y3, y3;                         
435                                                   
436         /* t0 ^= t1; */                           
437         vpxor x4, x0, x0;                         
438         vpxor x5, x1, x1;                         
439         vpxor x6, x2, x2;                         
440         vpxor x7, x3, x3;                         
441                                                   
442         /* t3 ^= t1; */                           
443         vpxor x4, y4, y4;                         
444         vpxor x5, y5, y5;                         
445         vpxor x6, y6, y6;                         
446         vpxor x7, y7, y7;                         
447                                                   
448         /* t2 ^= t0; */                           
449         vpxor x0, y0, y0;                         
450         vpxor x1, y1, y1;                         
451         vpxor x2, y2, y2;                         
452         vpxor x3, y3, y3;                         
453                                                   
454         /* t1 ^= t2; */                           
455         vpxor y0, x4, x4;                         
456         vpxor y1, x5, x5;                         
457         vpxor y2, x6, x6;                         
458         vpxor y3, x7, x7;                         
459                                                   
460 #define aria_fe(x0, x1, x2, x3,                   
461                 x4, x5, x6, x7,                   
462                 y0, y1, y2, y3,                   
463                 y4, y5, y6, y7,                   
464                 mem_tmp, rk, round)               
465         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
466                       y0, rk, 8, round);          
467                                                   
468         aria_sbox_8way(x2, x3, x0, x1, x6, x7,    
469                        y0, y1, y2, y3, y4, y5,    
470                                                   
471         aria_diff_m(x0, x1, x2, x3, y0, y1, y2    
472         aria_diff_m(x4, x5, x6, x7, y0, y1, y2    
473         aria_store_state_8way(x0, x1, x2, x3,     
474                               x4, x5, x6, x7,     
475                               mem_tmp, 8);        
476                                                   
477         aria_load_state_8way(x0, x1, x2, x3,      
478                              x4, x5, x6, x7,      
479                              mem_tmp, 0);         
480         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
481                       y0, rk, 0, round);          
482                                                   
483         aria_sbox_8way(x2, x3, x0, x1, x6, x7,    
484                        y0, y1, y2, y3, y4, y5,    
485                                                   
486         aria_diff_m(x0, x1, x2, x3, y0, y1, y2    
487         aria_diff_m(x4, x5, x6, x7, y0, y1, y2    
488         aria_store_state_8way(x0, x1, x2, x3,     
489                               x4, x5, x6, x7,     
490                               mem_tmp, 0);        
491         aria_load_state_8way(y0, y1, y2, y3,      
492                              y4, y5, y6, y7,      
493                              mem_tmp, 8);         
494         aria_diff_word(x0, x1, x2, x3,            
495                        x4, x5, x6, x7,            
496                        y0, y1, y2, y3,            
497                        y4, y5, y6, y7);           
498         /* aria_diff_byte()				\
499          * T3 = ABCD -> BADC				\
500          * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
501          * T0 = ABCD -> CDAB				\
502          * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
503          * T1 = ABCD -> DCBA				\
504          * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
505          */						\
506         aria_diff_word(x2, x3, x0, x1,            
507                        x7, x6, x5, x4,            
508                        y0, y1, y2, y3,            
509                        y5, y4, y7, y6);           
510         aria_store_state_8way(x3, x2, x1, x0,     
511                               x6, x7, x4, x5,     
512                               mem_tmp, 0);        
513                                                   
514 #define aria_fo(x0, x1, x2, x3,                   
515                 x4, x5, x6, x7,                   
516                 y0, y1, y2, y3,                   
517                 y4, y5, y6, y7,                   
518                 mem_tmp, rk, round)               
519         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
520                       y0, rk, 8, round);          
521                                                   
522         aria_sbox_8way(x0, x1, x2, x3, x4, x5,    
523                        y0, y1, y2, y3, y4, y5,    
524                                                   
525         aria_diff_m(x0, x1, x2, x3, y0, y1, y2    
526         aria_diff_m(x4, x5, x6, x7, y0, y1, y2    
527         aria_store_state_8way(x0, x1, x2, x3,     
528                               x4, x5, x6, x7,     
529                               mem_tmp, 8);        
530                                                   
531         aria_load_state_8way(x0, x1, x2, x3,      
532                              x4, x5, x6, x7,      
533                              mem_tmp, 0);         
534         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
535                       y0, rk, 0, round);          
536                                                   
537         aria_sbox_8way(x0, x1, x2, x3, x4, x5,    
538                        y0, y1, y2, y3, y4, y5,    
539                                                   
540         aria_diff_m(x0, x1, x2, x3, y0, y1, y2    
541         aria_diff_m(x4, x5, x6, x7, y0, y1, y2    
542         aria_store_state_8way(x0, x1, x2, x3,     
543                               x4, x5, x6, x7,     
544                               mem_tmp, 0);        
545         aria_load_state_8way(y0, y1, y2, y3,      
546                              y4, y5, y6, y7,      
547                              mem_tmp, 8);         
548         aria_diff_word(x0, x1, x2, x3,            
549                        x4, x5, x6, x7,            
550                        y0, y1, y2, y3,            
551                        y4, y5, y6, y7);           
552         /* aria_diff_byte()				\
553          * T1 = ABCD -> BADC				\
554          * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
555          * T2 = ABCD -> CDAB				\
556          * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
557          * T3 = ABCD -> DCBA				\
558          * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
559          */						\
560         aria_diff_word(x0, x1, x2, x3,            
561                        x5, x4, x7, x6,            
562                        y2, y3, y0, y1,            
563                        y7, y6, y5, y4);           
564         aria_store_state_8way(x3, x2, x1, x0,     
565                               x6, x7, x4, x5,     
566                               mem_tmp, 0);        
567                                                   
568 #define aria_ff(x0, x1, x2, x3,                   
569                 x4, x5, x6, x7,                   
570                 y0, y1, y2, y3,                   
571                 y4, y5, y6, y7,                   
572                 mem_tmp, rk, round, last_round    
573         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
574                       y0, rk, 8, round);          
575                                                   
576         aria_sbox_8way(x2, x3, x0, x1, x6, x7,    
577                        y0, y1, y2, y3, y4, y5,    
578                                                   
579         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
580                       y0, rk, 8, last_round);     
581                                                   
582         aria_store_state_8way(x0, x1, x2, x3,     
583                               x4, x5, x6, x7,     
584                               mem_tmp, 8);        
585                                                   
586         aria_load_state_8way(x0, x1, x2, x3,      
587                              x4, x5, x6, x7,      
588                              mem_tmp, 0);         
589         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
590                       y0, rk, 0, round);          
591                                                   
592         aria_sbox_8way(x2, x3, x0, x1, x6, x7,    
593                        y0, y1, y2, y3, y4, y5,    
594                                                   
595         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
596                       y0, rk, 0, last_round);     
597                                                   
598         aria_load_state_8way(y0, y1, y2, y3,      
599                              y4, y5, y6, y7,      
600                              mem_tmp, 8);         
601 #ifdef CONFIG_AS_GFNI                             
602 #define aria_fe_gfni(x0, x1, x2, x3,              
603                      x4, x5, x6, x7,              
604                      y0, y1, y2, y3,              
605                      y4, y5, y6, y7,              
606                      mem_tmp, rk, round)          
607         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
608                       y0, rk, 8, round);          
609                                                   
610         aria_sbox_8way_gfni(x2, x3, x0, x1,       
611                             x6, x7, x4, x5,       
612                             y0, y1, y2, y3,       
613                             y4, y5, y6, y7);      
614                                                   
615         aria_diff_m(x0, x1, x2, x3, y0, y1, y2    
616         aria_diff_m(x4, x5, x6, x7, y0, y1, y2    
617         aria_store_state_8way(x0, x1, x2, x3,     
618                               x4, x5, x6, x7,     
619                               mem_tmp, 8);        
620                                                   
621         aria_load_state_8way(x0, x1, x2, x3,      
622                              x4, x5, x6, x7,      
623                              mem_tmp, 0);         
624         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
625                       y0, rk, 0, round);          
626                                                   
627         aria_sbox_8way_gfni(x2, x3, x0, x1,       
628                             x6, x7, x4, x5,       
629                             y0, y1, y2, y3,       
630                             y4, y5, y6, y7);      
631                                                   
632         aria_diff_m(x0, x1, x2, x3, y0, y1, y2    
633         aria_diff_m(x4, x5, x6, x7, y0, y1, y2    
634         aria_store_state_8way(x0, x1, x2, x3,     
635                               x4, x5, x6, x7,     
636                               mem_tmp, 0);        
637         aria_load_state_8way(y0, y1, y2, y3,      
638                              y4, y5, y6, y7,      
639                              mem_tmp, 8);         
640         aria_diff_word(x0, x1, x2, x3,            
641                        x4, x5, x6, x7,            
642                        y0, y1, y2, y3,            
643                        y4, y5, y6, y7);           
644         /* aria_diff_byte()				\
645          * T3 = ABCD -> BADC				\
646          * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
647          * T0 = ABCD -> CDAB				\
648          * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
649          * T1 = ABCD -> DCBA				\
650          * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
651          */						\
652         aria_diff_word(x2, x3, x0, x1,            
653                        x7, x6, x5, x4,            
654                        y0, y1, y2, y3,            
655                        y5, y4, y7, y6);           
656         aria_store_state_8way(x3, x2, x1, x0,     
657                               x6, x7, x4, x5,     
658                               mem_tmp, 0);        
659                                                   
660 #define aria_fo_gfni(x0, x1, x2, x3,              
661                      x4, x5, x6, x7,              
662                      y0, y1, y2, y3,              
663                      y4, y5, y6, y7,              
664                      mem_tmp, rk, round)          
665         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
666                       y0, rk, 8, round);          
667                                                   
668         aria_sbox_8way_gfni(x0, x1, x2, x3,       
669                             x4, x5, x6, x7,       
670                             y0, y1, y2, y3,       
671                             y4, y5, y6, y7);      
672                                                   
673         aria_diff_m(x0, x1, x2, x3, y0, y1, y2    
674         aria_diff_m(x4, x5, x6, x7, y0, y1, y2    
675         aria_store_state_8way(x0, x1, x2, x3,     
676                               x4, x5, x6, x7,     
677                               mem_tmp, 8);        
678                                                   
679         aria_load_state_8way(x0, x1, x2, x3,      
680                              x4, x5, x6, x7,      
681                              mem_tmp, 0);         
682         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
683                       y0, rk, 0, round);          
684                                                   
685         aria_sbox_8way_gfni(x0, x1, x2, x3,       
686                             x4, x5, x6, x7,       
687                             y0, y1, y2, y3,       
688                             y4, y5, y6, y7);      
689                                                   
690         aria_diff_m(x0, x1, x2, x3, y0, y1, y2    
691         aria_diff_m(x4, x5, x6, x7, y0, y1, y2    
692         aria_store_state_8way(x0, x1, x2, x3,     
693                               x4, x5, x6, x7,     
694                               mem_tmp, 0);        
695         aria_load_state_8way(y0, y1, y2, y3,      
696                              y4, y5, y6, y7,      
697                              mem_tmp, 8);         
698         aria_diff_word(x0, x1, x2, x3,            
699                        x4, x5, x6, x7,            
700                        y0, y1, y2, y3,            
701                        y4, y5, y6, y7);           
702         /* aria_diff_byte()				\
703          * T1 = ABCD -> BADC				\
704          * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
705          * T2 = ABCD -> CDAB				\
706          * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
707          * T3 = ABCD -> DCBA				\
708          * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
709          */						\
710         aria_diff_word(x0, x1, x2, x3,            
711                        x5, x4, x7, x6,            
712                        y2, y3, y0, y1,            
713                        y7, y6, y5, y4);           
714         aria_store_state_8way(x3, x2, x1, x0,     
715                               x6, x7, x4, x5,     
716                               mem_tmp, 0);        
717                                                   
718 #define aria_ff_gfni(x0, x1, x2, x3,              
719                 x4, x5, x6, x7,                   
720                 y0, y1, y2, y3,                   
721                 y4, y5, y6, y7,                   
722                 mem_tmp, rk, round, last_round    
723         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
724                       y0, rk, 8, round);          
725                                                   
726         aria_sbox_8way_gfni(x2, x3, x0, x1,       
727                             x6, x7, x4, x5,       
728                             y0, y1, y2, y3,       
729                             y4, y5, y6, y7);      
730                                                   
731         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
732                       y0, rk, 8, last_round);     
733                                                   
734         aria_store_state_8way(x0, x1, x2, x3,     
735                               x4, x5, x6, x7,     
736                               mem_tmp, 8);        
737                                                   
738         aria_load_state_8way(x0, x1, x2, x3,      
739                              x4, x5, x6, x7,      
740                              mem_tmp, 0);         
741         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
742                       y0, rk, 0, round);          
743                                                   
744         aria_sbox_8way_gfni(x2, x3, x0, x1,       
745                             x6, x7, x4, x5,       
746                             y0, y1, y2, y3,       
747                             y4, y5, y6, y7);      
748                                                   
749         aria_ark_8way(x0, x1, x2, x3, x4, x5,     
750                       y0, rk, 0, last_round);     
751                                                   
752         aria_load_state_8way(y0, y1, y2, y3,      
753                              y4, y5, y6, y7,      
754                              mem_tmp, 8);         
755 #endif /* CONFIG_AS_GFNI */                       
756                                                   
757 .section        .rodata.cst32.shufb_16x16b, "a    
758 .align 32                                         
759 #define SHUFB_BYTES(idx) \                        
760         0 + (idx), 4 + (idx), 8 + (idx), 12 +     
761 .Lshufb_16x16b:                                   
762         .byte SHUFB_BYTES(0), SHUFB_BYTES(1),     
763         .byte SHUFB_BYTES(0), SHUFB_BYTES(1),     
764                                                   
765 .section        .rodata.cst16, "aM", @progbits    
766 .align 16                                         
767 /* For isolating SubBytes from AESENCLAST, inv    
768 .Linv_shift_row:                                  
769         .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x    
770         .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x    
771 .Lshift_row:                                      
772         .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x    
773         .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x    
774 /* For CTR-mode IV byteswap */                    
775 .Lbswap128_mask:                                  
776         .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x    
777         .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x    
778                                                   
779 /* AES inverse affine and S2 combined:            
780  *      1 1 0 0 0 0 0 1     x0     0              
781  *      0 1 0 0 1 0 0 0     x1     0              
782  *      1 1 0 0 1 1 1 1     x2     0              
783  *      0 1 1 0 1 0 0 1     x3     1              
784  *      0 1 0 0 1 1 0 0  *  x4  +  0              
785  *      0 1 0 1 1 0 0 0     x5     0              
786  *      0 0 0 0 0 1 0 1     x6     0              
787  *      1 1 1 0 0 1 1 1     x7     1              
788  */                                               
789 .Ltf_lo__inv_aff__and__s2:                        
790         .octa 0x92172DA81A9FA520B2370D883ABF85    
791 .Ltf_hi__inv_aff__and__s2:                        
792         .octa 0x2B15FFC1AF917B45E6D8320C625CB6    
793                                                   
794 /* X2 and AES forward affine combined:            
795  *      1 0 1 1 0 0 0 1     x0     0              
796  *      0 1 1 1 1 0 1 1     x1     0              
797  *      0 0 0 1 1 0 1 0     x2     1              
798  *      0 1 0 0 0 1 0 0     x3     0              
799  *      0 0 1 1 1 0 1 1  *  x4  +  0              
800  *      0 1 0 0 1 0 0 0     x5     0              
801  *      1 1 0 1 0 0 1 1     x6     0              
802  *      0 1 0 0 1 0 1 0     x7     0              
803  */                                               
804 .Ltf_lo__x2__and__fwd_aff:                        
805         .octa 0xEFAE0544FCBD1657B8F95213ABEA41    
806 .Ltf_hi__x2__and__fwd_aff:                        
807         .octa 0x3F893781E95FE1576CDA64D2BA0CB2    
808                                                   
809 #ifdef CONFIG_AS_GFNI                             
810 .section        .rodata.cst8, "aM", @progbits,    
811 .align 8                                          
812 /* AES affine: */                                 
813 #define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1,     
814 .Ltf_aff_bitmatrix:                               
815         .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1    
816                     BV8(1, 1, 0, 0, 0, 1, 1, 1    
817                     BV8(1, 1, 1, 0, 0, 0, 1, 1    
818                     BV8(1, 1, 1, 1, 0, 0, 0, 1    
819                     BV8(1, 1, 1, 1, 1, 0, 0, 0    
820                     BV8(0, 1, 1, 1, 1, 1, 0, 0    
821                     BV8(0, 0, 1, 1, 1, 1, 1, 0    
822                     BV8(0, 0, 0, 1, 1, 1, 1, 1    
823                                                   
824 /* AES inverse affine: */                         
825 #define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0,     
826 .Ltf_inv_bitmatrix:                               
827         .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1    
828                     BV8(1, 0, 0, 1, 0, 0, 1, 0    
829                     BV8(0, 1, 0, 0, 1, 0, 0, 1    
830                     BV8(1, 0, 1, 0, 0, 1, 0, 0    
831                     BV8(0, 1, 0, 1, 0, 0, 1, 0    
832                     BV8(0, 0, 1, 0, 1, 0, 0, 1    
833                     BV8(1, 0, 0, 1, 0, 1, 0, 0    
834                     BV8(0, 1, 0, 0, 1, 0, 1, 0    
835                                                   
836 /* S2: */                                         
837 #define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1    
838 .Ltf_s2_bitmatrix:                                
839         .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1    
840                     BV8(0, 0, 1, 1, 1, 1, 1, 1    
841                     BV8(1, 1, 1, 0, 1, 1, 0, 1    
842                     BV8(1, 1, 0, 0, 0, 0, 1, 1    
843                     BV8(0, 1, 0, 0, 0, 0, 1, 1    
844                     BV8(1, 1, 0, 0, 1, 1, 1, 0    
845                     BV8(0, 1, 1, 0, 0, 0, 1, 1    
846                     BV8(1, 1, 1, 1, 0, 1, 1, 0    
847                                                   
848 /* X2: */                                         
849 #define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0    
850 .Ltf_x2_bitmatrix:                                
851         .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0    
852                     BV8(0, 0, 1, 0, 0, 1, 1, 0    
853                     BV8(0, 0, 0, 0, 1, 0, 1, 0    
854                     BV8(1, 1, 1, 0, 0, 0, 1, 1    
855                     BV8(1, 1, 1, 0, 1, 1, 0, 0    
856                     BV8(0, 1, 1, 0, 1, 0, 1, 1    
857                     BV8(1, 0, 1, 1, 1, 1, 0, 1    
858                     BV8(1, 0, 0, 1, 0, 0, 1, 1    
859                                                   
860 /* Identity matrix: */                            
861 .Ltf_id_bitmatrix:                                
862         .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0    
863                     BV8(0, 1, 0, 0, 0, 0, 0, 0    
864                     BV8(0, 0, 1, 0, 0, 0, 0, 0    
865                     BV8(0, 0, 0, 1, 0, 0, 0, 0    
866                     BV8(0, 0, 0, 0, 1, 0, 0, 0    
867                     BV8(0, 0, 0, 0, 0, 1, 0, 0    
868                     BV8(0, 0, 0, 0, 0, 0, 1, 0    
869                     BV8(0, 0, 0, 0, 0, 0, 0, 1    
870                                                   
871 #endif /* CONFIG_AS_GFNI */                       
872                                                   
873 /* 4-bit mask */                                  
874 .section        .rodata.cst4.L0f0f0f0f, "aM",     
875 .align 4                                          
876 .L0f0f0f0f:                                       
877         .long 0x0f0f0f0f                          
878                                                   
879 .text                                             
880                                                   
881 SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_3    
882         /* input:                                 
883          *      %r9: rk                           
884          *      %rsi: dst                         
885          *      %rdx: src                         
886          *      %ymm0..%ymm15: byte-sliced blo    
887          */                                       
888                                                   
889         FRAME_BEGIN                               
890                                                   
891         movq %rsi, %rax;                          
892         leaq 8 * 32(%rax), %r8;                   
893                                                   
894         inpack16_post(%ymm0, %ymm1, %ymm2, %ym    
895                       %ymm8, %ymm9, %ymm10, %y    
896                       %ymm15, %rax, %r8);         
897         aria_fo(%ymm8, %ymm9, %ymm10, %ymm11,     
898                 %ymm0, %ymm1, %ymm2, %ymm3, %y    
899                 %rax, %r9, 0);                    
900         aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %y    
901                 %ymm8, %ymm9, %ymm10, %ymm11,     
902                 %ymm15, %rax, %r9, 1);            
903         aria_fo(%ymm9, %ymm8, %ymm11, %ymm10,     
904                 %ymm0, %ymm1, %ymm2, %ymm3, %y    
905                 %rax, %r9, 2);                    
906         aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %y    
907                 %ymm8, %ymm9, %ymm10, %ymm11,     
908                 %ymm15, %rax, %r9, 3);            
909         aria_fo(%ymm9, %ymm8, %ymm11, %ymm10,     
910                 %ymm0, %ymm1, %ymm2, %ymm3, %y    
911                 %rax, %r9, 4);                    
912         aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %y    
913                 %ymm8, %ymm9, %ymm10, %ymm11,     
914                 %ymm15, %rax, %r9, 5);            
915         aria_fo(%ymm9, %ymm8, %ymm11, %ymm10,     
916                 %ymm0, %ymm1, %ymm2, %ymm3, %y    
917                 %rax, %r9, 6);                    
918         aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %y    
919                 %ymm8, %ymm9, %ymm10, %ymm11,     
920                 %ymm15, %rax, %r9, 7);            
921         aria_fo(%ymm9, %ymm8, %ymm11, %ymm10,     
922                 %ymm0, %ymm1, %ymm2, %ymm3, %y    
923                 %rax, %r9, 8);                    
924         aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %y    
925                 %ymm8, %ymm9, %ymm10, %ymm11,     
926                 %ymm15, %rax, %r9, 9);            
927         aria_fo(%ymm9, %ymm8, %ymm11, %ymm10,     
928                 %ymm0, %ymm1, %ymm2, %ymm3, %y    
929                 %rax, %r9, 10);                   
930         cmpl $12, ARIA_CTX_rounds(CTX);           
931         jne .Laria_192;                           
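         /*
          * ARIA-128/192/256 use 12/14/16 rounds, so the final rounds are
          * selected at run time from ARIA_CTX_rounds.
          */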
932         aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %y    
933                 %ymm8, %ymm9, %ymm10, %ymm11,     
934                 %ymm15, %rax, %r9, 11, 12);       
935         jmp .Laria_end;                           
936 .Laria_192:                                       
937         aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %y    
938                 %ymm8, %ymm9, %ymm10, %ymm11,     
939                 %ymm15, %rax, %r9, 11);           
940         aria_fo(%ymm9, %ymm8, %ymm11, %ymm10,     
941                 %ymm0, %ymm1, %ymm2, %ymm3, %y    
942                 %rax, %r9, 12);                   
943         cmpl $14, ARIA_CTX_rounds(CTX);           
944         jne .Laria_256;                           
945         aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %y    
946                 %ymm8, %ymm9, %ymm10, %ymm11,     
947                 %ymm15, %rax, %r9, 13, 14);       
948         jmp .Laria_end;                           
949 .Laria_256:                                       
950         aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %y    
951                 %ymm8, %ymm9, %ymm10, %ymm11,     
952                 %ymm15, %rax, %r9, 13);           
953         aria_fo(%ymm9, %ymm8, %ymm11, %ymm10,     
954                 %ymm0, %ymm1, %ymm2, %ymm3, %y    
955                 %rax, %r9, 14);                   
956         aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %y    
957                 %ymm8, %ymm9, %ymm10, %ymm11,     
958                 %ymm15, %rax, %r9, 15, 16);       
959 .Laria_end:                                       
960         debyteslice_16x16b(%ymm8, %ymm12, %ymm    
961                            %ymm9, %ymm13, %ymm    
962                            %ymm10, %ymm14, %ym    
963                            %ymm11, %ymm15, %ym    
964                            (%rax), (%r8));        
965                                                   
966         FRAME_END                                 
967         RET;                                      
968 SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)       
969                                                   
970 SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_3    
971         /* input:                                 
972          *      %rdi: ctx, CTX                    
973          *      %rsi: dst                         
974          *      %rdx: src                         
975          */                                       
976                                                   
977         FRAME_BEGIN                               
978                                                   
979         leaq ARIA_CTX_enc_key(CTX), %r9;          
980                                                   
981         inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm    
982                      %ymm8, %ymm9, %ymm10, %ym    
983                      %ymm15, %rdx);               
984                                                   
985         call __aria_aesni_avx2_crypt_32way;       
986                                                   
987         write_output(%ymm1, %ymm0, %ymm3, %ymm    
988                      %ymm8, %ymm9, %ymm10, %ym    
989                      %ymm15, %rax);               
990                                                   
991         FRAME_END                                 
992         RET;                                      
993 SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)       
994                                                   
995 SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_3    
996         /* input:                                 
997          *      %rdi: ctx, CTX                    
998          *      %rsi: dst                         
999          *      %rdx: src                         
1000          */                                      
1001                                                  
1002         FRAME_BEGIN                              
1003                                                  
1004         leaq ARIA_CTX_dec_key(CTX), %r9;         
1005                                                  
1006         inpack16_pre(%ymm0, %ymm1, %ymm2, %ym    
1007                      %ymm8, %ymm9, %ymm10, %y    
1008                      %ymm15, %rdx);              
1009                                                  
1010         call __aria_aesni_avx2_crypt_32way;      
1011                                                  
1012         write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1013                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1014                      %ymm15, %rax);
1015                                                  
1016         FRAME_END                                
1017         RET;                                     
1018 SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)      
1019                                                  
1020 SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
1021         /* input:                                
1022          *      %rdi: ctx                        
1023          *      %rsi: dst                        
1024          *      %rdx: src                        
1025          *      %rcx: keystream                  
1026          *      %r8: iv (big endian, 128bit)     
1027          */                                      
1028                                                  
1029         FRAME_BEGIN                              
1030         movq 8(%r8), %r11;                       
1031         bswapq %r11;                             
1032                                                  
1033         vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
1034         vpcmpeqd %ymm0, %ymm0, %ymm0;            
1035         vpsrldq $8, %ymm0, %ymm0;   /* ab: -1:0 ; cd: -1:0 */
1036         vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */
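        /*
         * Each %ymm register holds two consecutive 128-bit counter blocks,
         * kept little-endian between the vpshufb byteswaps.  %ymm0 carries -1
         * in the low qword of each 128-bit lane (the constant inc_le128
         * expects) and %ymm5 carries -2, so one vpsubq advances both counters
         * in a register by 2 without propagating a carry.
         */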
1037                                                  
1038         /* load IV and byteswap */               
1039         vmovdqu (%r8), %xmm7;                    
1040         vpshufb %xmm6, %xmm7, %xmm7;             
1041         vmovdqa %xmm7, %xmm3;                    
1042         inc_le128(%xmm7, %xmm0, %xmm4);          
1043         vinserti128 $1, %xmm7, %ymm3, %ymm3;     
1044         vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */
1045                                                  
1046         /* check need for handling 64-bit overflow and carry */
1047         cmpq $(0xffffffffffffffff - 32), %r11    
1048         ja .Lhandle_ctr_carry;                   
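        /*
         * %r11 holds the byteswapped low 64 bits of the IV.  If adding 32 to
         * it cannot wrap, the fast path below can advance the counters with
         * plain packed 64-bit adds; otherwise every increment has to go
         * through inc_le128 so the carry reaches the high qword.
         */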
1049                                                  
1050         /* construct IVs */                      
1051         vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
1052         vpshufb %ymm6, %ymm3, %ymm9;
1053         vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
1054         vpshufb %ymm6, %ymm3, %ymm10;
1055         vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
1056         vpshufb %ymm6, %ymm3, %ymm11;
1057         vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
1058         vpshufb %ymm6, %ymm3, %ymm12;
1059         vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
1060         vpshufb %ymm6, %ymm3, %ymm13;
1061         vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
1062         vpshufb %ymm6, %ymm3, %ymm14;
1063         vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
1064         vpshufb %ymm6, %ymm3, %ymm15;
1065         vmovdqu %ymm8, (0 * 32)(%rcx);           
1066         vmovdqu %ymm9, (1 * 32)(%rcx);           
1067         vmovdqu %ymm10, (2 * 32)(%rcx);          
1068         vmovdqu %ymm11, (3 * 32)(%rcx);          
1069         vmovdqu %ymm12, (4 * 32)(%rcx);          
1070         vmovdqu %ymm13, (5 * 32)(%rcx);          
1071         vmovdqu %ymm14, (6 * 32)(%rcx);          
1072         vmovdqu %ymm15, (7 * 32)(%rcx);          
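        /* keystream blocks +0..+15 are staged in the buffer; +16..+31 follow */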
1073                                                  
1074         vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
1075         vpshufb %ymm6, %ymm3, %ymm8;
1076         vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
1077         vpshufb %ymm6, %ymm3, %ymm9;
1078         vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
1079         vpshufb %ymm6, %ymm3, %ymm10;
1080         vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
1081         vpshufb %ymm6, %ymm3, %ymm11;
1082         vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
1083         vpshufb %ymm6, %ymm3, %ymm12;
1084         vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
1085         vpshufb %ymm6, %ymm3, %ymm13;
1086         vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
1087         vpshufb %ymm6, %ymm3, %ymm14;
1088         vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
1089         vpshufb %ymm6, %ymm3, %ymm15;
1090         vpsubq %ymm5, %ymm3, %ymm3; /* +32 */    
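        /* swap the +32 counter back to big endian and store it as the updated IV */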
1091         vpshufb %xmm6, %xmm3, %xmm3;             
1092         vmovdqu %xmm3, (%r8);                    
1093         vmovdqu (0 * 32)(%rcx), %ymm0;           
1094         vmovdqu (1 * 32)(%rcx), %ymm1;           
1095         vmovdqu (2 * 32)(%rcx), %ymm2;           
1096         vmovdqu (3 * 32)(%rcx), %ymm3;           
1097         vmovdqu (4 * 32)(%rcx), %ymm4;           
1098         vmovdqu (5 * 32)(%rcx), %ymm5;           
1099         vmovdqu (6 * 32)(%rcx), %ymm6;           
1100         vmovdqu (7 * 32)(%rcx), %ymm7;           
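        /*
         * Fast path done: %ymm0..%ymm7 reload blocks +0..+15 from the buffer
         * while %ymm8..%ymm15 still hold blocks +16..+31, which is the
         * register layout the crypt routine consumes.
         */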
1101         jmp .Lctr_carry_done;                    
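        /*
         * Slow path: the low 64 bits of the counter may wrap within these 32
         * blocks, so every step uses inc_le128 (a full 128-bit increment with
         * carry) instead of the packed 64-bit adds used above.
         */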
1102                                                  
1103         .Lhandle_ctr_carry:                      
1104         /* construct IVs */                      
1105         inc_le128(%ymm3, %ymm0, %ymm4);
1106         inc_le128(%ymm3, %ymm0, %ymm4);
1107         vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
1108         inc_le128(%ymm3, %ymm0, %ymm4);
1109         inc_le128(%ymm3, %ymm0, %ymm4);
1110         vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
1111         inc_le128(%ymm3, %ymm0, %ymm4);
1112         inc_le128(%ymm3, %ymm0, %ymm4);
1113         vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
1114         inc_le128(%ymm3, %ymm0, %ymm4);
1115         inc_le128(%ymm3, %ymm0, %ymm4);
1116         vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
1117         inc_le128(%ymm3, %ymm0, %ymm4);
1118         inc_le128(%ymm3, %ymm0, %ymm4);
1119         vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
1120         inc_le128(%ymm3, %ymm0, %ymm4);
1121         inc_le128(%ymm3, %ymm0, %ymm4);
1122         vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
1123         inc_le128(%ymm3, %ymm0, %ymm4);
1124         inc_le128(%ymm3, %ymm0, %ymm4);
1125         vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
1126         vmovdqu %ymm8, (0 * 32)(%rcx);           
1127         vmovdqu %ymm9, (1 * 32)(%rcx);           
1128         vmovdqu %ymm10, (2 * 32)(%rcx);          
1129         vmovdqu %ymm11, (3 * 32)(%rcx);          
1130         vmovdqu %ymm12, (4 * 32)(%rcx);          
1131         vmovdqu %ymm13, (5 * 32)(%rcx);          
1132         vmovdqu %ymm14, (6 * 32)(%rcx);          
1133         vmovdqu %ymm15, (7 * 32)(%rcx);          
1134                                                  
1135         inc_le128(%ymm3, %ymm0, %ymm4);
1136         inc_le128(%ymm3, %ymm0, %ymm4);
1137         vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
1138         inc_le128(%ymm3, %ymm0, %ymm4);
1139         inc_le128(%ymm3, %ymm0, %ymm4);
1140         vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
1141         inc_le128(%ymm3, %ymm0, %ymm4);
1142         inc_le128(%ymm3, %ymm0, %ymm4);
1143         vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
1144         inc_le128(%ymm3, %ymm0, %ymm4);
1145         inc_le128(%ymm3, %ymm0, %ymm4);
1146         vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
1147         inc_le128(%ymm3, %ymm0, %ymm4);
1148         inc_le128(%ymm3, %ymm0, %ymm4);
1149         vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
1150         inc_le128(%ymm3, %ymm0, %ymm4);
1151         inc_le128(%ymm3, %ymm0, %ymm4);
1152         vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
1153         inc_le128(%ymm3, %ymm0, %ymm4);
1154         inc_le128(%ymm3, %ymm0, %ymm4);
1155         vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
1156         inc_le128(%ymm3, %ymm0, %ymm4);
1157         inc_le128(%ymm3, %ymm0, %ymm4);
1158         vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
1159         inc_le128(%ymm3, %ymm0, %ymm4);
1160         vextracti128 $1, %ymm3, %xmm3;
1161         vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
1162         vmovdqu %xmm3, (%r8);                    
1163         vmovdqu (0 * 32)(%rcx), %ymm0;           
1164         vmovdqu (1 * 32)(%rcx), %ymm1;           
1165         vmovdqu (2 * 32)(%rcx), %ymm2;           
1166         vmovdqu (3 * 32)(%rcx), %ymm3;           
1167         vmovdqu (4 * 32)(%rcx), %ymm4;           
1168         vmovdqu (5 * 32)(%rcx), %ymm5;           
1169         vmovdqu (6 * 32)(%rcx), %ymm6;           
1170         vmovdqu (7 * 32)(%rcx), %ymm7;           
1171                                                  
1172         .Lctr_carry_done:                        
1173                                                  
1174         FRAME_END                                
1175         RET;                                     
1176 SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)
1177                                                  
1178 SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
1179         /* input:                                
1180          *      %rdi: ctx                        
1181          *      %rsi: dst                        
1182          *      %rdx: src                        
1183          *      %rcx: keystream                  
1184          *      %r8: iv (big endian, 128bit)     
1185          */                                      
1186         FRAME_BEGIN                              
1187                                                  
1188         call __aria_aesni_avx2_ctr_gen_keystream_32way;
1189                                                  
1190         leaq (%rsi), %r10;                       
1191         leaq (%rdx), %r11;                       
1192         leaq (%rcx), %rsi;                       
1193         leaq (%rcx), %rdx;                       
1194         leaq ARIA_CTX_enc_key(CTX), %r9;         
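        /*
         * Park dst in %r10 and src in %r11, then point both %rsi and %rdx at
         * the keystream buffer so the crypt routine can reuse it as scratch;
         * the 32 generated counter blocks are already in %ymm0..%ymm15 and get
         * encrypted with the encryption round keys.
         */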
1195                                                  
1196         call __aria_aesni_avx2_crypt_32way;      
1197                                                  
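        /* XOR the encrypted counters with the 32 source blocks to produce the output */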
1198         vpxor (0 * 32)(%r11), %ymm1, %ymm1;      
1199         vpxor (1 * 32)(%r11), %ymm0, %ymm0;      
1200         vpxor (2 * 32)(%r11), %ymm3, %ymm3;      
1201         vpxor (3 * 32)(%r11), %ymm2, %ymm2;      
1202         vpxor (4 * 32)(%r11), %ymm4, %ymm4;      
1203         vpxor (5 * 32)(%r11), %ymm5, %ymm5;      
1204         vpxor (6 * 32)(%r11), %ymm6, %ymm6;      
1205         vpxor (7 * 32)(%r11), %ymm7, %ymm7;      
1206         vpxor (8 * 32)(%r11), %ymm8, %ymm8;      
1207         vpxor (9 * 32)(%r11), %ymm9, %ymm9;      
1208         vpxor (10 * 32)(%r11), %ymm10, %ymm10;
1209         vpxor (11 * 32)(%r11), %ymm11, %ymm11;
1210         vpxor (12 * 32)(%r11), %ymm12, %ymm12;
1211         vpxor (13 * 32)(%r11), %ymm13, %ymm13;
1212         vpxor (14 * 32)(%r11), %ymm14, %ymm14;
1213         vpxor (15 * 32)(%r11), %ymm15, %ymm15;
1214         write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1215                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1216                      %ymm15, %r10);
1217                                                  
1218         FRAME_END                                
1219         RET;                                     
1220 SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)    
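/*
 * Assumed C-side declaration for the CTR entry point, mirroring the register
 * assignments documented above (a sketch only; the authoritative prototype is
 * in the glue code):
 *
 *   asmlinkage void aria_aesni_avx2_ctr_crypt_32way(const void *ctx, u8 *dst,
 *                                                   const u8 *src,
 *                                                   u8 *keystream, u8 *iv);
 */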
1221                                                  
1222 #ifdef CONFIG_AS_GFNI                            
1223 SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
1224         /* input:                                
1225          *      %r9: rk                          
1226          *      %rsi: dst                        
1227          *      %rdx: src                        
1228          *      %ymm0..%ymm15: 16 byte-sliced blocks
1229          */                                      
1230                                                  
1231         FRAME_BEGIN                              
1232                                                  
1233         movq %rsi, %rax;                         
1234         leaq 8 * 32(%rax), %r8;                  
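        /*
         * %rax points at dst and %r8 at dst + 256: the destination buffer is
         * reused as two 8 * 32-byte scratch areas for byte-slicing and the
         * per-round temporary state.
         */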
1235                                                  
1236         inpack16_post(%ymm0, %ymm1, %ymm2, %y    
1237                       %ymm4, %ymm5, %ymm6, %y    
1238                       %ymm8, %ymm9, %ymm10, %    
1239                       %ymm12, %ymm13, %ymm14,    
1240                       %ymm15, %rax, %r8);        
1241         aria_fo_gfni(%ymm8, %ymm9, %ymm10, %y    
1242                      %ymm12, %ymm13, %ymm14,     
1243                      %ymm0, %ymm1, %ymm2, %ym    
1244                      %ymm4, %ymm5, %ymm6, %ym    
1245                      %rax, %r9, 0);              
1246         aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ym    
1247                      %ymm4, %ymm5, %ymm6, %ym    
1248                      %ymm8, %ymm9, %ymm10, %y    
1249                      %ymm12, %ymm13, %ymm14,     
1250                      %ymm15, %rax, %r9, 1);      
1251         aria_fo_gfni(%ymm9, %ymm8, %ymm11, %y    
1252                      %ymm12, %ymm13, %ymm14,     
1253                      %ymm0, %ymm1, %ymm2, %ym    
1254                      %ymm4, %ymm5, %ymm6, %ym    
1255                      %rax, %r9, 2);              
1256         aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ym    
1257                      %ymm4, %ymm5, %ymm6, %ym    
1258                      %ymm8, %ymm9, %ymm10, %y    
1259                      %ymm12, %ymm13, %ymm14,     
1260                      %ymm15, %rax, %r9, 3);      
1261         aria_fo_gfni(%ymm9, %ymm8, %ymm11, %y    
1262                      %ymm12, %ymm13, %ymm14,     
1263                      %ymm0, %ymm1, %ymm2, %ym    
1264                      %ymm4, %ymm5, %ymm6, %ym    
1265                      %rax, %r9, 4);              
1266         aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ym    
1267                      %ymm4, %ymm5, %ymm6, %ym    
1268                      %ymm8, %ymm9, %ymm10, %y    
1269                      %ymm12, %ymm13, %ymm14,     
1270                      %ymm15, %rax, %r9, 5);      
1271         aria_fo_gfni(%ymm9, %ymm8, %ymm11, %y    
1272                      %ymm12, %ymm13, %ymm14,     
1273                      %ymm0, %ymm1, %ymm2, %ym    
1274                      %ymm4, %ymm5, %ymm6, %ym    
1275                      %rax, %r9, 6);              
1276         aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ym    
1277                      %ymm4, %ymm5, %ymm6, %ym    
1278                      %ymm8, %ymm9, %ymm10, %y    
1279                      %ymm12, %ymm13, %ymm14,     
1280                      %ymm15, %rax, %r9, 7);      
1281         aria_fo_gfni(%ymm9, %ymm8, %ymm11, %y    
1282                      %ymm12, %ymm13, %ymm14,     
1283                      %ymm0, %ymm1, %ymm2, %ym    
1284                      %ymm4, %ymm5, %ymm6, %ym    
1285                      %rax, %r9, 8);              
1286         aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ym    
1287                      %ymm4, %ymm5, %ymm6, %ym    
1288                      %ymm8, %ymm9, %ymm10, %y    
1289                      %ymm12, %ymm13, %ymm14,     
1290                      %ymm15, %rax, %r9, 9);      
1291         aria_fo_gfni(%ymm9, %ymm8, %ymm11, %y    
1292                      %ymm12, %ymm13, %ymm14,     
1293                      %ymm0, %ymm1, %ymm2, %ym    
1294                      %ymm4, %ymm5, %ymm6, %ym    
1295                      %rax, %r9, 10);             
1296         cmpl $12, ARIA_CTX_rounds(CTX);          
1297         jne .Laria_gfni_192;                     
1298         aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ym    
1299                 %ymm8, %ymm9, %ymm10, %ymm11,    
1300                 %ymm15, %rax, %r9, 11, 12);      
1301         jmp .Laria_gfni_end;                     
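/*
 * rounds != 12: ARIA-192 (14 rounds) and ARIA-256 (16 rounds) continue with
 * round keys 11 and 12 before the next rounds check.
 */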
1302 .Laria_gfni_192:                                 
1303         aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ym    
1304                      %ymm4, %ymm5, %ymm6, %ym    
1305                      %ymm8, %ymm9, %ymm10, %y    
1306                      %ymm12, %ymm13, %ymm14,     
1307                      %ymm15, %rax, %r9, 11);     
1308         aria_fo_gfni(%ymm9, %ymm8, %ymm11, %y    
1309                      %ymm12, %ymm13, %ymm14,     
1310                      %ymm0, %ymm1, %ymm2, %ym    
1311                      %ymm4, %ymm5, %ymm6, %ym    
1312                      %rax, %r9, 12);             
1313         cmpl $14, ARIA_CTX_rounds(CTX);          
1314         jne .Laria_gfni_256;                     
1315         aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ym    
1316                      %ymm4, %ymm5, %ymm6, %ym    
1317                      %ymm8, %ymm9, %ymm10, %y    
1318                      %ymm12, %ymm13, %ymm14,     
1319                      %ymm15, %rax, %r9, 13, 1    
1320         jmp .Laria_gfni_end;                     
1321 .Laria_gfni_256:                                 
1322         aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ym    
1323                      %ymm4, %ymm5, %ymm6, %ym    
1324                      %ymm8, %ymm9, %ymm10, %y    
1325                      %ymm12, %ymm13, %ymm14,     
1326                      %ymm15, %rax, %r9, 13);     
1327         aria_fo_gfni(%ymm9, %ymm8, %ymm11, %y    
1328                      %ymm12, %ymm13, %ymm14,     
1329                      %ymm0, %ymm1, %ymm2, %ym    
1330                      %ymm4, %ymm5, %ymm6, %ym    
1331                      %rax, %r9, 14);             
1332         aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ym    
1333                      %ymm4, %ymm5, %ymm6, %ym    
1334                      %ymm8, %ymm9, %ymm10, %y    
1335                      %ymm12, %ymm13, %ymm14,     
1336                      %ymm15, %rax, %r9, 15, 1    
1337 .Laria_gfni_end:                                 
1338         debyteslice_16x16b(%ymm8, %ymm12, %ym    
1339                            %ymm9, %ymm13, %ym    
1340                            %ymm10, %ymm14, %y    
1341                            %ymm11, %ymm15, %y    
1342                            (%rax), (%r8));       
1343                                                  
1344         FRAME_END                                
1345         RET;                                     
1346 SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)
1347                                                  
1348 SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
1349         /* input:                                
1350          *      %rdi: ctx, CTX                   
1351          *      %rsi: dst                        
1352          *      %rdx: src                        
1353          */                                      
1354                                                  
1355         FRAME_BEGIN                              
1356                                                  
1357         leaq ARIA_CTX_enc_key(CTX), %r9;         
1358                                                  
1359         inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1360                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1361                      %ymm15, %rdx);
1362                                                  
1363         call __aria_aesni_avx2_gfni_crypt_32way;
1364                                                  
1365         write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1366                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1367                      %ymm15, %rax);
1368                                                  
1369         FRAME_END                                
1370         RET;                                     
1371 SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)
1372                                                  
1373 SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
1374         /* input:                                
1375          *      %rdi: ctx, CTX                   
1376          *      %rsi: dst                        
1377          *      %rdx: src                        
1378          */                                      
1379                                                  
1380         FRAME_BEGIN                              
1381                                                  
1382         leaq ARIA_CTX_dec_key(CTX), %r9;         
1383                                                  
1384         inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
1385                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1386                      %ymm15, %rdx);
1387                                                  
1388         call __aria_aesni_avx2_gfni_crypt_32way;
1389                                                  
1390         write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1391                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1392                      %ymm15, %rax);
1393                                                  
1394         FRAME_END                                
1395         RET;                                     
1396 SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)
1397                                                  
1398 SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
1399         /* input:                                
1400          *      %rdi: ctx                        
1401          *      %rsi: dst                        
1402          *      %rdx: src                        
1403          *      %rcx: keystream                  
1404          *      %r8: iv (big endian, 128bit)     
1405          */                                      
1406         FRAME_BEGIN                              
1407                                                  
1408         call __aria_aesni_avx2_ctr_gen_keystream_32way;
1409                                                  
1410         leaq (%rsi), %r10;                       
1411         leaq (%rdx), %r11;                       
1412         leaq (%rcx), %rsi;                       
1413         leaq (%rcx), %rdx;                       
1414         leaq ARIA_CTX_enc_key(CTX), %r9;         
1415                                                  
1416         call __aria_aesni_avx2_gfni_crypt_32way;
1417                                                  
1418         vpxor (0 * 32)(%r11), %ymm1, %ymm1;      
1419         vpxor (1 * 32)(%r11), %ymm0, %ymm0;      
1420         vpxor (2 * 32)(%r11), %ymm3, %ymm3;      
1421         vpxor (3 * 32)(%r11), %ymm2, %ymm2;      
1422         vpxor (4 * 32)(%r11), %ymm4, %ymm4;      
1423         vpxor (5 * 32)(%r11), %ymm5, %ymm5;      
1424         vpxor (6 * 32)(%r11), %ymm6, %ymm6;      
1425         vpxor (7 * 32)(%r11), %ymm7, %ymm7;      
1426         vpxor (8 * 32)(%r11), %ymm8, %ymm8;      
1427         vpxor (9 * 32)(%r11), %ymm9, %ymm9;      
1428         vpxor (10 * 32)(%r11), %ymm10, %ymm10;
1429         vpxor (11 * 32)(%r11), %ymm11, %ymm11;
1430         vpxor (12 * 32)(%r11), %ymm12, %ymm12;
1431         vpxor (13 * 32)(%r11), %ymm13, %ymm13;
1432         vpxor (14 * 32)(%r11), %ymm14, %ymm14;
1433         vpxor (15 * 32)(%r11), %ymm15, %ymm15;
1434         write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
1435                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
1436                      %ymm15, %r10);
1437                                                  
1438         FRAME_END                                
1439         RET;                                     
1440 SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
1441 #endif /* CONFIG_AS_GFNI */                      
                                                      
