~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/x86/crypto/crct10dif-pcl-asm_64.S

Version: ~ [ linux-6.12-rc7 ] ~ [ linux-6.11.7 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.60 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.116 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.171 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.229 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.285 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.323 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.12 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

Diff markup

Differences between /arch/x86/crypto/crct10dif-pcl-asm_64.S (Version linux-6.12-rc7) and /arch/i386/crypto/crct10dif-pcl-asm_64.S (Version linux-4.4.302)


  1 ##############################################    
  2 # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
  3 #                                                 
  4 # Copyright (c) 2013, Intel Corporation           
  5 #                                                 
  6 # Authors:                                        
  7 #     Erdinc Ozturk <erdinc.ozturk@intel.com>      
  8 #     Vinodh Gopal <vinodh.gopal@intel.com>        
  9 #     James Guilford <james.guilford@intel.com>    
 10 #     Tim Chen <tim.c.chen@linux.intel.com>        
 11 #                                                 
 12 # This software is available to you under a ch    
 13 # licenses.  You may choose to be licensed und    
 14 # General Public License (GPL) Version 2, avai    
 15 # COPYING in the main directory of this source    
 16 # OpenIB.org BSD license below:                   
 17 #                                                 
 18 # Redistribution and use in source and binary     
 19 # modification, are permitted provided that th    
 20 # met:                                            
 21 #                                                 
 22 # * Redistributions of source code must retain    
 23 #   notice, this list of conditions and the fo    
 24 #                                                 
 25 # * Redistributions in binary form must reprod    
 26 #   notice, this list of conditions and the fo    
 27 #   documentation and/or other materials provi    
 28 #   distribution.                                 
 29 #                                                 
 30 # * Neither the name of the Intel Corporation     
 31 #   contributors may be used to endorse or pro    
 32 #   this software without specific prior writt    
 33 #                                                 
 34 #                                                 
 35 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATI    
 36 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BU    
 37 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FI    
 38 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IN    
 39 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIR    
 40 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDI    
 41 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;    
 42 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER C    
 43 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABI    
 44 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY     
 45 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY    
 46 #                                                 
 47 #       Reference paper titled "Fast CRC Computation for Generic
 48 #       Polynomials Using PCLMULQDQ Instruction"
 49 #       URL: http://www.intel.com/content/dam/    
 50 #  /white-papers/fast-crc-computation-generic-    
 51 #                                                 
 52                                                   
 53 #include <linux/linkage.h>                        
 54                                                   
 55 .text                                             
 56                                                   
    # Register aliases for the three arguments of crc_t10dif_pcl
    # (init_crc, buf, len), in that order.
 57 #define         init_crc        %edi              
 58 #define         buf             %rsi              
 59 #define         len             %rdx              
 60                                                   
    # Vector registers with a fixed role throughout the function:
    # FOLD_CONSTS is reloaded with the constant pair of each folding stage;
    # BSWAP_MASK holds the pshufb control loaded from .Lbswap_mask.
 61 #define         FOLD_CONSTS     %xmm10            
 62 #define         BSWAP_MASK      %xmm11            
 63                                                   
 64 # Fold reg1, reg2 into the next 32 data bytes, storing the result back into
 65 # reg1, reg2.
    #
    # offset:     byte offset from 'buf' of the 32 data bytes to load
    # reg1, reg2: xmm accumulators; each is carry-less multiplied by
    #             FOLD_CONSTS and XORed with 16 byte-swapped data bytes
    # Scratch:    %xmm8, %xmm9, %xmm12, %xmm13
    #
    # NOTE(review): the last operand of the four pclmulqdq lines is clipped
    # in this listing; those lines are kept exactly as found.
 66 .macro  fold_32_bytes   offset, reg1, reg2        
 67         movdqu  \offset(buf), %xmm9               
 68         movdqu  \offset+16(buf), %xmm12           
 69         pshufb  BSWAP_MASK, %xmm9                 
 70         pshufb  BSWAP_MASK, %xmm12                
 71         movdqa  \reg1, %xmm8                      
 72         movdqa  \reg2, %xmm13                     
 73         pclmulqdq       $0x00, FOLD_CONSTS, \r    
 74         pclmulqdq       $0x11, FOLD_CONSTS, %x    
 75         pclmulqdq       $0x00, FOLD_CONSTS, \r    
 76         pclmulqdq       $0x11, FOLD_CONSTS, %x    
 77         pxor    %xmm9 , \reg1                     
 78         xorps   %xmm8 , \reg1                     
 79         pxor    %xmm12, \reg2                     
 80         xorps   %xmm13, \reg2                     
 81 .endm                                             
 82                                                   
 83 # Fold src_reg into dst_reg.
    #
    # The result is accumulated into dst_reg by the two XORs at the end.
    # Uses %xmm8 as scratch.  src_reg is presumably overwritten as well: the
    # destination of the first pclmulqdq is clipped in this listing after
    # "\s" - TODO confirm against the unclipped source.
 84 .macro  fold_16_bytes   src_reg, dst_reg          
 85         movdqa  \src_reg, %xmm8                   
 86         pclmulqdq       $0x11, FOLD_CONSTS, \s    
 87         pclmulqdq       $0x00, FOLD_CONSTS, %x    
 88         pxor    %xmm8, \dst_reg                   
 89         xorps   \src_reg, \dst_reg                
 90 .endm                                             
 91                                                   
 92 #                                                 
 93 # u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
 94 #                                                 
 95 # Assumes len >= 16.                              
    #
    # In:       init_crc = %edi, buf = %rsi, len = %rdx
    # Out:      CRC extracted from the low word of %xmm0 into %eax
    # Modifies: %rax, %rsi, %rdx, %xmm0-%xmm13, flags
    #
    # NOTE(review): many instruction lines below are clipped at the listing's
    # column limit (missing operands or trailing comments).  They are kept
    # exactly as found; only comment text has been completed.
 96 #                                                 
 97 SYM_FUNC_START(crc_t10dif_pcl)                    
 98                                                   
 99         movdqa  .Lbswap_mask(%rip), BSWAP_MASK    
100                                                   
101         # For sizes less than 256 bytes, we can't fold 128 bytes at a time.
            # NOTE(review): jl is a signed comparison, so a len >= 2^63 would
            # also take this branch - presumably unreachable; confirm callers.
102         cmp     $256, len                         
103         jl      .Lless_than_256_bytes             
104                                                   
105         # Load the first 128 data bytes.  Byte swapping is necessary to make the
106         # bit order match the polynomial coefficient order.
107         movdqu  16*0(buf), %xmm0                  
108         movdqu  16*1(buf), %xmm1                  
109         movdqu  16*2(buf), %xmm2                  
110         movdqu  16*3(buf), %xmm3                  
111         movdqu  16*4(buf), %xmm4                  
112         movdqu  16*5(buf), %xmm5                  
113         movdqu  16*6(buf), %xmm6                  
114         movdqu  16*7(buf), %xmm7                  
115         add     $128, buf                         
116         pshufb  BSWAP_MASK, %xmm0                 
117         pshufb  BSWAP_MASK, %xmm1                 
118         pshufb  BSWAP_MASK, %xmm2                 
119         pshufb  BSWAP_MASK, %xmm3                 
120         pshufb  BSWAP_MASK, %xmm4                 
121         pshufb  BSWAP_MASK, %xmm5                 
122         pshufb  BSWAP_MASK, %xmm6                 
123         pshufb  BSWAP_MASK, %xmm7                 
124                                                   
125         # XOR the first 16 data *bits* with the initial CRC value.
126         pxor    %xmm8, %xmm8                      
127         pinsrw  $7, init_crc, %xmm8               
128         pxor    %xmm8, %xmm0                      
129                                                   
130         movdqa  .Lfold_across_128_bytes_consts    
131                                                   
132         # Subtract 128 for the 128 data bytes just consumed.  Subtract another
133         # 128 to simplify the termination condition of the following loop.
134         sub     $256, len                         
135                                                   
136         # While >= 128 data bytes remain (not counting xmm0-7), fold the 128
137         # bytes xmm0-7 into them, storing the result back into xmm0-7.
138 .Lfold_128_bytes_loop:                            
139         fold_32_bytes   0, %xmm0, %xmm1           
140         fold_32_bytes   32, %xmm2, %xmm3          
141         fold_32_bytes   64, %xmm4, %xmm5          
142         fold_32_bytes   96, %xmm6, %xmm7          
143         add     $128, buf                         
144         sub     $128, len                         
145         jge     .Lfold_128_bytes_loop             
146                                                   
147         # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
148                                                   
149         # Fold across 64 bytes.                   
150         movdqa  .Lfold_across_64_bytes_consts(    
151         fold_16_bytes   %xmm0, %xmm4              
152         fold_16_bytes   %xmm1, %xmm5              
153         fold_16_bytes   %xmm2, %xmm6              
154         fold_16_bytes   %xmm3, %xmm7              
155         # Fold across 32 bytes.                   
156         movdqa  .Lfold_across_32_bytes_consts(    
157         fold_16_bytes   %xmm4, %xmm6              
158         fold_16_bytes   %xmm5, %xmm7              
159         # Fold across 16 bytes.                   
160         movdqa  .Lfold_across_16_bytes_consts(    
161         fold_16_bytes   %xmm6, %xmm7              
162                                                   
163         # Add 128 to get the correct number of data bytes remaining in 0...127
164         # (not counting xmm7), following the previous extra subtraction by 128.
165         # Then subtract 16 to simplify the termination condition of the
166         # following loop.                         
167         add     $128-16, len                      
168                                                   
169         # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
170         # xmm7 into them, storing the result back into xmm7.
171         jl      .Lfold_16_bytes_loop_done         
172 .Lfold_16_bytes_loop:                             
173         movdqa  %xmm7, %xmm8                      
174         pclmulqdq       $0x11, FOLD_CONSTS, %x    
175         pclmulqdq       $0x00, FOLD_CONSTS, %x    
176         pxor    %xmm8, %xmm7                      
177         movdqu  (buf), %xmm0                      
178         pshufb  BSWAP_MASK, %xmm0                 
179         pxor    %xmm0 , %xmm7                     
180         add     $16, buf                          
181         sub     $16, len                          
182         jge     .Lfold_16_bytes_loop              
183                                                   
184 .Lfold_16_bytes_loop_done:                        
185         # Add 16 to get the correct number of data bytes remaining in 0...15
186         # (not counting xmm7), following the previous extra subtraction by 16.
187         add     $16, len                          
188         je      .Lreduce_final_16_bytes           
189                                                   
190 .Lhandle_partial_segment:                         
191         # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
192         # bytes are in xmm7 and the rest are the remaining data in 'buf'.  To do
193         # this without needing a fold constant for each possible 'len', redivide
194         # the bytes into a first chunk of 'len' bytes and a second chunk of 16
195         # bytes, then fold the first chunk into the second.
196                                                   
197         movdqa  %xmm7, %xmm2                      
198                                                   
199         # xmm1 = last 16 original data bytes      
200         movdqu  -16(buf, len), %xmm1              
201         pshufb  BSWAP_MASK, %xmm1                 
202                                                   
203         # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
204         lea     .Lbyteshift_table+16(%rip), %r    
205         sub     len, %rax                         
206         movdqu  (%rax), %xmm0                     
207         pshufb  %xmm0, %xmm2                      
208                                                   
209         # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
210         pxor    .Lmask1(%rip), %xmm0              
211         pshufb  %xmm0, %xmm7                      
212                                                   
213         # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
214         # then '16-len' bytes from xmm2 (high-order bytes).
215         pblendvb        %xmm2, %xmm1    #xmm0 is implicit
216                                                   
217         # Fold the first chunk into the second chunk, storing the result in xmm7.
218         movdqa  %xmm7, %xmm8                      
219         pclmulqdq       $0x11, FOLD_CONSTS, %x    
220         pclmulqdq       $0x00, FOLD_CONSTS, %x    
221         pxor    %xmm8, %xmm7                      
222         pxor    %xmm1, %xmm7                      
223                                                   
224 .Lreduce_final_16_bytes:                          
225         # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC.
226                                                   
227         # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
228         movdqa  .Lfinal_fold_consts(%rip), FOL    
229                                                   
230         # Fold the high 64 bits into the low 64 bits, while also multiplying by
231         # x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
232         # whose low 48 bits are 0.                
233         movdqa  %xmm7, %xmm0                      
234         pclmulqdq       $0x11, FOLD_CONSTS, %x    
235         pslldq  $8, %xmm0                         
236         pxor    %xmm0, %xmm7                      
237                                                   
238         # Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
239         # value congruent to x^64 * M(x) and whose low 48 bits are 0.
240         movdqa  %xmm7, %xmm0                      
241         pand    .Lmask2(%rip), %xmm0              
242         psrldq  $12, %xmm7                        
243         pclmulqdq       $0x00, FOLD_CONSTS, %x    
244         pxor    %xmm0, %xmm7                      
245                                                   
246         # Load G(x) and floor(x^48 / G(x)).       
247         movdqa  .Lbarrett_reduction_consts(%ri    
248                                                   
249         # Use Barrett reduction to compute the final CRC value.
250         movdqa  %xmm7, %xmm0                      
251         pclmulqdq       $0x11, FOLD_CONSTS, %x    
252         psrlq   $32, %xmm7                        
253         pclmulqdq       $0x00, FOLD_CONSTS, %x    
254         psrlq   $48, %xmm0                        
255         pxor    %xmm7, %xmm0                 # + low 16 nonzero bits
256         # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
257                                                   
258         pextrw  $0, %xmm0, %eax                   
259         RET                                       
260                                                   
261 .align 16                                         
262 .Lless_than_256_bytes:                            
263         # Checksumming a buffer of length 16...255 bytes
264                                                   
265         # Load the first 16 data bytes.           
266         movdqu  (buf), %xmm7                      
267         pshufb  BSWAP_MASK, %xmm7                 
268         add     $16, buf                          
269                                                   
270         # XOR the first 16 data *bits* with the initial CRC value.
271         pxor    %xmm0, %xmm0                      
272         pinsrw  $7, init_crc, %xmm0               
273         pxor    %xmm0, %xmm7                      
274                                                   
275         movdqa  .Lfold_across_16_bytes_consts(    
            # len == 16: all data is already in xmm7, nothing left to fold.
276         cmp     $16, len                          
277         je      .Lreduce_final_16_bytes           
            # len -= 32: 16 for the bytes just loaded, plus the extra 16 that
            # .Lfold_16_bytes_loop expects to have been subtracted already.
278         sub     $32, len                          
279         jge     .Lfold_16_bytes_loop              
            # Fewer than 16 bytes remain: undo the extra 16 so that len is the
            # number of remaining bytes (1...15) for the partial-segment path.
280         add     $16, len                          
281         jmp     .Lhandle_partial_segment          
282 SYM_FUNC_END(crc_t10dif_pcl)                      
283                                                   
284 .section        .rodata, "a", @progbits           
285 .align 16                                         
286                                                   
287 # Fold constants precomputed from the polynomial 0x18bb7:
288 # G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
    # Each 16-byte pair below is loaded by crc_t10dif_pcl before the folding
    # stage named in its label.
289 .Lfold_across_128_bytes_consts:                   
290         .quad           0x0000000000006123        
291         .quad           0x0000000000002295        
292 .Lfold_across_64_bytes_consts:                    
293         .quad           0x0000000000001069        
294         .quad           0x000000000000dd31        
295 .Lfold_across_32_bytes_consts:                    
296         .quad           0x000000000000857d        
297         .quad           0x0000000000007acc        
298 .Lfold_across_16_bytes_consts:                    
299         .quad           0x000000000000a010        
300         .quad           0x0000000000001faa        
301 .Lfinal_fold_consts:                              
302         .quad           0x1368000000000000        # x^48 * (x^48 mod G(x))
303         .quad           0x2d56000000000000        # x^48 * (x^80 mod G(x))
304 .Lbarrett_reduction_consts:                       
305         .quad           0x0000000000018bb7        # G(x)
306         .quad           0x00000001f65a57f8        # floor(x^48 / G(x))
307                                                   
308 .section        .rodata.cst16.mask1, "aM", @pr    
309 .align 16                                         
    # XORed into the byteshift index vector in .Lhandle_partial_segment to turn
    # the left-shift control into the right-shift control.
310 .Lmask1:                                          
311         .octa   0x8080808080808080808080808080    
312                                                   
313 .section        .rodata.cst16.mask2, "aM", @pr    
314 .align 16                                         
    # ANDed with xmm0 in .Lreduce_final_16_bytes; the visible high 32 bits of
    # the value are zero, so the AND clears the high 32 bits.
315 .Lmask2:                                          
316         .octa   0x00000000FFFFFFFFFFFFFFFFFFFF    
317                                                   
318 .section        .rodata.cst16.bswap_mask, "aM"    
319 .align 16                                         
    # pshufb control loaded into BSWAP_MASK and applied to every data load.
320 .Lbswap_mask:                                     
321         .octa   0x000102030405060708090A0B0C0D    
322                                                   
323 .section        .rodata.cst32.byteshift_table,    
324 .align 16                                         
325 # For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
326 # is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
327 # 0x80} XOR the index vector to shift right by '16 - len' bytes.
328 .Lbyteshift_table:                                
329         .byte            0x0, 0x81, 0x82, 0x83    
330         .byte           0x88, 0x89, 0x8a, 0x8b    
331         .byte            0x0,  0x1,  0x2,  0x3    
332         .byte            0x8,  0x9,  0xa,  0xb    
                                                      

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php