TOMOYO Linux Cross Reference
Linux/arch/x86/crypto/sm4-aesni-avx-asm_64.S


/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, AES-NI/AVX optimized.
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
 * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 *  https://github.com/mjosaarinen/sm4ni
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

#define rRIP         (%rip)

#define RX0          %xmm0
#define RX1          %xmm1
#define MASK_4BIT    %xmm2
#define RTMP0        %xmm3
#define RTMP1        %xmm4
#define RTMP2        %xmm5
#define RTMP3        %xmm6
#define RTMP4        %xmm7

#define RA0          %xmm8
#define RA1          %xmm9
#define RA2          %xmm10
#define RA3          %xmm11

#define RB0          %xmm12
#define RB1          %xmm13
#define RB2          %xmm14
#define RB3          %xmm15

#define RNOT         %xmm0
#define RBSWAP       %xmm1

/* Transpose four 32-bit words between 128-bit vectors. */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
        vpunpckhdq x1, x0, t2;                \
        vpunpckldq x1, x0, x0;                \
                                              \
        vpunpckldq x3, x2, t1;                \
        vpunpckhdq x3, x2, x2;                \
                                              \
        vpunpckhqdq t1, x0, x1;               \
        vpunpcklqdq t1, x0, x0;               \
                                              \
        vpunpckhqdq x2, t2, x3;               \
        vpunpcklqdq x2, t2, x2;

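/*
 * Reference-only sketch in plain C (illustrative, not used by the build):
 * with each of the four registers viewed as one row of a 4x4 matrix of
 * 32-bit words, transpose_4x4() above performs an ordinary matrix transpose,
 * switching between "one 16-byte block per register" and "one SM4 state word
 * (across four blocks) per register" layouts:
 *
 *      #include <stdint.h>
 *      #include <string.h>
 *
 *      static void transpose_4x4_ref(uint32_t m[4][4])
 *      {
 *              uint32_t t[4][4];
 *              int i, j;
 *
 *              for (i = 0; i < 4; i++)
 *                      for (j = 0; j < 4; j++)
 *                              t[j][i] = m[i][j];
 *              memcpy(m, t, sizeof(t));
 *      }
 */
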
/* pre-SubByte transform. */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
        vpand x, mask4bit, tmp0;                     \
        vpandn x, mask4bit, x;                       \
        vpsrld $4, x, x;                             \
                                                     \
        vpshufb tmp0, lo_t, tmp0;                    \
        vpshufb x, hi_t, x;                          \
        vpxor tmp0, x, x;

/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by the
 * 'vaesenclast' instruction.
 */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
        vpandn mask4bit, x, tmp0;                     \
        vpsrld $4, x, x;                              \
        vpand x, mask4bit, x;                         \
                                                      \
        vpshufb tmp0, lo_t, tmp0;                     \
        vpshufb x, hi_t, x;                           \
        vpxor tmp0, x, x;


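/*
 * Reference-only sketch in plain C (illustrative) of what transform_pre()
 * and transform_post() compute per byte: vpshufb acts as sixteen parallel
 * 4-bit table look-ups, so an 8-bit-to-8-bit mapping is done as one look-up
 * on the low nibble and one on the high nibble, XOR'ed together.  lo_t/hi_t
 * correspond to the 16-byte .L*_tf_lo_s/.L*_tf_hi_s constants below; the
 * extra and/andn against MASK_4BIT in transform_post() only undoes the 0x0f
 * round-key XOR that vaesenclast has already applied to its result.
 *
 *      #include <stdint.h>
 *
 *      static uint8_t nibble_lut_ref(const uint8_t lo_t[16],
 *                                    const uint8_t hi_t[16], uint8_t x)
 *      {
 *              return lo_t[x & 0x0f] ^ hi_t[x >> 4];
 *      }
 */
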
.section        .rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * Following four affine transform look-up tables are from work by
 * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * These allow exposing SM4 S-Box from AES SubByte.
 */

/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
        .quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
        .quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
        .quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
        .quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
        .byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
        .byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
        .byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
        .byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
        .byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
        .byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
        .long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
        .long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef


.text

/*
 * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
SYM_FUNC_START(sm4_aesni_avx_crypt4)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (1..4 blocks)
         *      %rdx: src (1..4 blocks)
         *      %rcx: num blocks (1..4)
         */
        FRAME_BEGIN

        vmovdqu 0*16(%rdx), RA0;
        vmovdqa RA0, RA1;
        vmovdqa RA0, RA2;
        vmovdqa RA0, RA3;
        cmpq $2, %rcx;
        jb .Lblk4_load_input_done;
        vmovdqu 1*16(%rdx), RA1;
        je .Lblk4_load_input_done;
        vmovdqu 2*16(%rdx), RA2;
        cmpq $3, %rcx;
        je .Lblk4_load_input_done;
        vmovdqu 3*16(%rdx), RA3;

.Lblk4_load_input_done:

        vmovdqa .Lbswap32_mask rRIP, RTMP2;
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;

        vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
        vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
        vmovdqa .Lpre_tf_hi_s rRIP, RB0;
        vmovdqa .Lpost_tf_lo_s rRIP, RB1;
        vmovdqa .Lpost_tf_hi_s rRIP, RB2;
        vmovdqa .Linv_shift_row rRIP, RB3;
        vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
        vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);

#define ROUND(round, s0, s1, s2, s3)                          \
        vbroadcastss (4*(round))(%rdi), RX0;                  \
        vpxor s1, RX0, RX0;                                   \
        vpxor s2, RX0, RX0;                                   \
        vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */           \
                                                              \
        /* sbox, non-linear part */                           \
        transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0);     \
        vaesenclast MASK_4BIT, RX0, RX0;                      \
        transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0);      \
                                                              \
        /* linear part */                                     \
        vpshufb RB3, RX0, RTMP0;                              \
        vpxor RTMP0, s0, s0; /* s0 ^ x */                     \
        vpshufb RTMP2, RX0, RTMP1;                            \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */         \
        vpshufb RTMP3, RX0, RTMP1;                            \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1;      \
        vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */         \
        vpslld $2, RTMP0, RTMP1;                              \
        vpsrld $30, RTMP0, RTMP0;                             \
        vpxor RTMP0, s0, s0;                                  \
        /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpxor RTMP1, s0, s0;

        leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk4:
        ROUND(0, RA0, RA1, RA2, RA3);
        ROUND(1, RA1, RA2, RA3, RA0);
        ROUND(2, RA2, RA3, RA0, RA1);
        ROUND(3, RA3, RA0, RA1, RA2);
        leaq (4*4)(%rdi), %rdi;
        cmpq %rax, %rdi;
        jne .Lroundloop_blk4;

#undef ROUND

        vmovdqa .Lbswap128_mask rRIP, RTMP2;

        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;

        vmovdqu RA0, 0*16(%rsi);
        cmpq $2, %rcx;
        jb .Lblk4_store_output_done;
        vmovdqu RA1, 1*16(%rsi);
        je .Lblk4_store_output_done;
        vmovdqu RA2, 2*16(%rsi);
        cmpq $3, %rcx;
        je .Lblk4_store_output_done;
        vmovdqu RA3, 3*16(%rsi);

.Lblk4_store_output_done:
        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)
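
/*
 * Reference-only sketch in plain C (illustrative, not kernel code): per
 * 32-bit lane, ROUND() above is one standard SM4 round,
 * s0 ^= T(s1 ^ s2 ^ s3 ^ rk), where T is the byte-wise S-box followed by the
 * linear transform L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24).
 * The round loop runs the four-round body eight times (32 rounds, 32 round
 * keys), and the final .Lbswap128_mask shuffle accounts for SM4's reversed,
 * big-endian output word order.  sm4_sbox[] stands for the usual SM4 S-box
 * table, which the vector code derives from AES SubBytes instead of storing.
 *
 *      #include <stdint.h>
 *
 *      extern const uint8_t sm4_sbox[256];     // standard SM4 S-box
 *
 *      static uint32_t rol32_ref(uint32_t x, unsigned int n)
 *      {
 *              return (x << n) | (x >> (32 - n));
 *      }
 *
 *      static uint32_t sm4_t_ref(uint32_t x)
 *      {
 *              x = (uint32_t)sm4_sbox[x >> 24] << 24 |
 *                  (uint32_t)sm4_sbox[(x >> 16) & 0xff] << 16 |
 *                  (uint32_t)sm4_sbox[(x >> 8) & 0xff] << 8 |
 *                  (uint32_t)sm4_sbox[x & 0xff];
 *              return x ^ rol32_ref(x, 2) ^ rol32_ref(x, 10) ^
 *                     rol32_ref(x, 18) ^ rol32_ref(x, 24);
 *      }
 *
 *      // one pass of .Lroundloop_blk4: four rounds, with the state words
 *      // rotating through the "s0" position
 *      static void sm4_round4_ref(uint32_t s[4], const uint32_t rk[4])
 *      {
 *              s[0] ^= sm4_t_ref(s[1] ^ s[2] ^ s[3] ^ rk[0]);
 *              s[1] ^= sm4_t_ref(s[2] ^ s[3] ^ s[0] ^ rk[1]);
 *              s[2] ^= sm4_t_ref(s[3] ^ s[0] ^ s[1] ^ rk[2]);
 *              s[3] ^= sm4_t_ref(s[0] ^ s[1] ^ s[2] ^ rk[3]);
 *      }
 */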

SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
         *                                              plaintext blocks
         * output:
         *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
         *                                              ciphertext blocks
         */
        FRAME_BEGIN

        vmovdqa .Lbswap32_mask rRIP, RTMP2;
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;
        vpshufb RTMP2, RB0, RB0;
        vpshufb RTMP2, RB1, RB1;
        vpshufb RTMP2, RB2, RB2;
        vpshufb RTMP2, RB3, RB3;

        vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);

#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3)          \
        vbroadcastss (4*(round))(%rdi), RX0;                  \
        vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;                    \
        vmovdqa .Lpre_tf_hi_s rRIP, RTMP1;                    \
        vmovdqa RX0, RX1;                                     \
        vpxor s1, RX0, RX0;                                   \
        vpxor s2, RX0, RX0;                                   \
        vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */           \
        vmovdqa .Lpost_tf_lo_s rRIP, RTMP2;                   \
        vmovdqa .Lpost_tf_hi_s rRIP, RTMP3;                   \
        vpxor r1, RX1, RX1;                                   \
        vpxor r2, RX1, RX1;                                   \
        vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */           \
                                                              \
        /* sbox, non-linear part */                           \
        transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0);   \
        transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0);   \
        vmovdqa .Linv_shift_row rRIP, RTMP4;                  \
        vaesenclast MASK_4BIT, RX0, RX0;                      \
        vaesenclast MASK_4BIT, RX1, RX1;                      \
        transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0);  \
        transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0);  \
                                                              \
        /* linear part */                                     \
        vpshufb RTMP4, RX0, RTMP0;                            \
        vpxor RTMP0, s0, s0; /* s0 ^ x */                     \
        vpshufb RTMP4, RX1, RTMP2;                            \
        vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4;            \
        vpxor RTMP2, r0, r0; /* r0 ^ x */                     \
        vpshufb RTMP4, RX0, RTMP1;                            \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */         \
        vpshufb RTMP4, RX1, RTMP3;                            \
        vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4;           \
        vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */         \
        vpshufb RTMP4, RX0, RTMP1;                            \
        vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb RTMP4, RX1, RTMP3;                            \
        vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4;           \
        vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
        vpshufb RTMP4, RX0, RTMP1;                            \
        vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */         \
        /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpslld $2, RTMP0, RTMP1;                              \
        vpsrld $30, RTMP0, RTMP0;                             \
        vpxor RTMP0, s0, s0;                                  \
        vpxor RTMP1, s0, s0;                                  \
        vpshufb RTMP4, RX1, RTMP3;                            \
        vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */         \
        /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
        vpslld $2, RTMP2, RTMP3;                              \
        vpsrld $30, RTMP2, RTMP2;                             \
        vpxor RTMP2, r0, r0;                                  \
        vpxor RTMP3, r0, r0;

        leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk8:
        ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
        ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
        ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
        ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
        leaq (4*4)(%rdi), %rdi;
        cmpq %rax, %rdi;
        jne .Lroundloop_blk8;

#undef ROUND

        vmovdqa .Lbswap128_mask rRIP, RTMP2;

        transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
        transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
        vpshufb RTMP2, RA0, RA0;
        vpshufb RTMP2, RA1, RA1;
        vpshufb RTMP2, RA2, RA2;
        vpshufb RTMP2, RA3, RA3;
        vpshufb RTMP2, RB0, RB0;
        vpshufb RTMP2, RB1, RB1;
        vpshufb RTMP2, RB2, RB2;
        vpshufb RTMP2, RB3, RB3;

        FRAME_END
        RET;
SYM_FUNC_END(__sm4_crypt_blk8)
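
/*
 * Reference-only note (illustrative): __sm4_crypt_blk8() runs the same 32
 * rounds as the 4-block path above, but keeps two independent four-block
 * groups (the RA* and RB* registers) in flight inside each ROUND() so the
 * two vaesenclast/vpshufb dependency chains can overlap.  Per lane, one pass
 * of .Lroundloop_blk8 is simply the earlier sketch applied to both groups
 * with the same round keys:
 *
 *      static void sm4_round4_x2_ref(uint32_t a[4], uint32_t b[4],
 *                                    const uint32_t rk[4])
 *      {
 *              sm4_round4_ref(a, rk);  // sm4_round4_ref() from the sketch above
 *              sm4_round4_ref(b, rk);
 *      }
 */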

/*
 * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
SYM_FUNC_START(sm4_aesni_avx_crypt8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (1..8 blocks)
         *      %rdx: src (1..8 blocks)
         *      %rcx: num blocks (1..8)
         */
        cmpq $5, %rcx;
        jb sm4_aesni_avx_crypt4;

        FRAME_BEGIN

        vmovdqu (0 * 16)(%rdx), RA0;
        vmovdqu (1 * 16)(%rdx), RA1;
        vmovdqu (2 * 16)(%rdx), RA2;
        vmovdqu (3 * 16)(%rdx), RA3;
        vmovdqu (4 * 16)(%rdx), RB0;
        vmovdqa RB0, RB1;
        vmovdqa RB0, RB2;
        vmovdqa RB0, RB3;
        je .Lblk8_load_input_done;
        vmovdqu (5 * 16)(%rdx), RB1;
        cmpq $7, %rcx;
        jb .Lblk8_load_input_done;
        vmovdqu (6 * 16)(%rdx), RB2;
        je .Lblk8_load_input_done;
        vmovdqu (7 * 16)(%rdx), RB3;

.Lblk8_load_input_done:
        call __sm4_crypt_blk8;

        cmpq $6, %rcx;
        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        jb .Lblk8_store_output_done;
        vmovdqu RB1, (5 * 16)(%rsi);
        je .Lblk8_store_output_done;
        vmovdqu RB2, (6 * 16)(%rsi);
        cmpq $7, %rcx;
        je .Lblk8_store_output_done;
        vmovdqu RB3, (7 * 16)(%rsi);

.Lblk8_store_output_done:
        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_crypt8)
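
/*
 * Reference-only sketch in plain C (illustrative): externally,
 * sm4_aesni_avx_crypt4() and sm4_aesni_avx_crypt8() behave as plain ECB over
 * 'nblocks' blocks of 16 bytes.  The compare/branch ladders above only decide
 * how many XMM lanes carry real data; unused lanes are filled with copies of
 * an already-loaded block so every lane holds defined plaintext.
 * sm4_crypt_block_ref() is a hypothetical scalar helper built from the round
 * sketch earlier:
 *
 *      static void sm4_crypt_blocks_ref(const uint32_t *rk, uint8_t *dst,
 *                                       const uint8_t *src, int nblocks)
 *      {
 *              int i;
 *
 *              for (i = 0; i < nblocks; i++)
 *                      sm4_crypt_block_ref(rk, dst + 16 * i, src + 16 * i);
 *      }
 */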

/*
 * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (8 blocks)
         *      %rdx: src (8 blocks)
         *      %rcx: iv (big endian, 128bit)
         */
        FRAME_BEGIN

        /* load IV and byteswap */
        vmovdqu (%rcx), RA0;

        vmovdqa .Lbswap128_mask rRIP, RBSWAP;
        vpshufb RBSWAP, RA0, RTMP0; /* be => le */

        vpcmpeqd RNOT, RNOT, RNOT;
        vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */

#define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp;  \
        vpsubq minus_one, x, x;      \
        vpslldq $8, tmp, tmp;        \
        vpsubq tmp, x, x;

        /* construct IVs */
        inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
        vpshufb RBSWAP, RTMP0, RA1;
        inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
        vpshufb RBSWAP, RTMP0, RA2;
        inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
        vpshufb RBSWAP, RTMP0, RA3;
        inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
        vpshufb RBSWAP, RTMP0, RB0;
        inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
        vpshufb RBSWAP, RTMP0, RB1;
        inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
        vpshufb RBSWAP, RTMP0, RB2;
        inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
        vpshufb RBSWAP, RTMP0, RB3;
        inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
        vpshufb RBSWAP, RTMP0, RTMP1;

        /* store new IV */
        vmovdqu RTMP1, (%rcx);

        call __sm4_crypt_blk8;

        vpxor (0 * 16)(%rdx), RA0, RA0;
        vpxor (1 * 16)(%rdx), RA1, RA1;
        vpxor (2 * 16)(%rdx), RA2, RA2;
        vpxor (3 * 16)(%rdx), RA3, RA3;
        vpxor (4 * 16)(%rdx), RB0, RB0;
        vpxor (5 * 16)(%rdx), RB1, RB1;
        vpxor (6 * 16)(%rdx), RB2, RB2;
        vpxor (7 * 16)(%rdx), RB3, RB3;

        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        vmovdqu RB1, (5 * 16)(%rsi);
        vmovdqu RB2, (6 * 16)(%rsi);
        vmovdqu RB3, (7 * 16)(%rsi);

        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)
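
/*
 * Reference-only sketch in plain C (illustrative): inc_le128() above
 * increments the byte-swapped (little-endian) copy of the counter, with
 * vpcmpeqq/vpslldq/vpsubq propagating the carry from the low into the high
 * 64-bit half.  On the big-endian IV bytes, each step is simply a 128-bit
 * increment:
 *
 *      #include <stdint.h>
 *
 *      static void ctr_iv_inc_ref(uint8_t iv[16])
 *      {
 *              int i;
 *
 *              for (i = 15; i >= 0; i--)
 *                      if (++iv[i])
 *                              break;
 *      }
 */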

/*
 * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
        /* input:
         *      %rdi: round key array, CTX
         *      %rsi: dst (8 blocks)
         *      %rdx: src (8 blocks)
         *      %rcx: iv
         */
        FRAME_BEGIN

        vmovdqu (0 * 16)(%rdx), RA0;
        vmovdqu (1 * 16)(%rdx), RA1;
        vmovdqu (2 * 16)(%rdx), RA2;
        vmovdqu (3 * 16)(%rdx), RA3;
        vmovdqu (4 * 16)(%rdx), RB0;
        vmovdqu (5 * 16)(%rdx), RB1;
        vmovdqu (6 * 16)(%rdx), RB2;
        vmovdqu (7 * 16)(%rdx), RB3;

        call __sm4_crypt_blk8;

        vmovdqu (7 * 16)(%rdx), RNOT;
        vpxor (%rcx), RA0, RA0;
        vpxor (0 * 16)(%rdx), RA1, RA1;
        vpxor (1 * 16)(%rdx), RA2, RA2;
        vpxor (2 * 16)(%rdx), RA3, RA3;
        vpxor (3 * 16)(%rdx), RB0, RB0;
        vpxor (4 * 16)(%rdx), RB1, RB1;
        vpxor (5 * 16)(%rdx), RB2, RB2;
        vpxor (6 * 16)(%rdx), RB3, RB3;
        vmovdqu RNOT, (%rcx); /* store new IV */

        vmovdqu RA0, (0 * 16)(%rsi);
        vmovdqu RA1, (1 * 16)(%rsi);
        vmovdqu RA2, (2 * 16)(%rsi);
        vmovdqu RA3, (3 * 16)(%rsi);
        vmovdqu RB0, (4 * 16)(%rsi);
        vmovdqu RB1, (5 * 16)(%rsi);
        vmovdqu RB2, (6 * 16)(%rsi);
        vmovdqu RB3, (7 * 16)(%rsi);

        vzeroall;
        FRAME_END
        RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
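
/*
 * Reference-only sketch in plain C (illustrative): the CBC path decrypts
 * eight ciphertext blocks with the supplied round keys (the caller is
 * expected to pass the decryption key schedule), XORs each result with the
 * previous ciphertext block (the IV for the first), and makes the last
 * ciphertext block the new IV.  All of src is read before dst is written,
 * so in-place operation (dst == src) stays correct; the sketch below keeps
 * that property by walking backwards.  sm4_crypt_block_ref() and
 * xor_block_ref() are hypothetical scalar helpers.
 *
 *      #include <stdint.h>
 *      #include <string.h>
 *
 *      static void cbc_dec_blk8_ref(const uint32_t *rk, uint8_t *dst,
 *                                   const uint8_t *src, uint8_t *iv)
 *      {
 *              uint8_t newiv[16], buf[16];
 *              int i;
 *
 *              memcpy(newiv, src + 7 * 16, 16);
 *              for (i = 7; i >= 0; i--) {
 *                      sm4_crypt_block_ref(rk, buf, src + i * 16);
 *                      xor_block_ref(dst + i * 16, buf,
 *                                    i ? src + (i - 1) * 16 : iv);
 *              }
 *              memcpy(iv, newiv, 16);
 *      }
 */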
                                                      
