########################################################################
# Implement fast SHA-512 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     David Cote <david.m.cote@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-512 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.text

# Virtual Registers
Y_0 = %ymm4
Y_1 = %ymm5
Y_2 = %ymm6
Y_3 = %ymm7

YTMP0 = %ymm0
YTMP1 = %ymm1
YTMP2 = %ymm2
YTMP3 = %ymm3
YTMP4 = %ymm8
XFER  = YTMP0

BYTE_FLIP_MASK  = %ymm9

# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
CTX1        = %rdi
CTX2        = %r12
# 2nd arg
INP         = %rsi
# 3rd arg
NUM_BLKS    = %rdx

c           = %rcx
d           = %r8
e           = %rdx
y3          = %rsi

TBL   = %rdi # clobbers CTX1

a     = %rax
b     = %rbx

f     = %r9
g     = %r10
h     = %r11
old_h = %r11

T1    = %r12 # clobbers CTX2
y0    = %r13
y1    = %r14
y2    = %r15

# Local variables (stack frame)
XFER_SIZE = 4*8
SRND_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
CTX_SIZE = 1*8

frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
frame_INP = frame_SRND + SRND_SIZE
frame_INPEND = frame_INP + INP_SIZE
frame_CTX = frame_INPEND + INPEND_SIZE
frame_size = frame_CTX + CTX_SIZE

## assume buffers not aligned
#define VMOVDQ vmovdqu

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm
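# For example (illustration, not in the original source): "addm 8*0(CTX2), a"
# computes a += state[0] and stores the sum back to state[0], folding the
# working variable into the saved digest word.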


# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
# Load ymm with mem and byte swap each qword
.macro COPY_YMM_AND_BSWAP p1 p2 p3
        VMOVDQ \p2, \p1
        vpshufb \p3, \p1, \p1
.endm
# rotate_Ys
# Rotate values of symbols Y0...Y3
.macro rotate_Ys
        Y_ = Y_0
        Y_0 = Y_1
        Y_1 = Y_2
        Y_2 = Y_3
        Y_3 = Y_
.endm

# RotateState
.macro RotateState
        # Rotate symbols a..h right
        old_h  = h
        TMP_   = h
        h      = g
        g      = f
        f      = e
        e      = d
        d      = c
        c      = b
        b      = a
        a      = TMP_
.endm

# macro MY_VPALIGNR      YDST, YSRC1, YSRC2, RVAL
# YDST = {YSRC1, YSRC2} >> RVAL*8
.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
        vperm2f128      $0x3, \YSRC2, \YSRC1, \YDST     # YDST = {YS1_LO, YS2_HI}
        vpalignr        $\RVAL, \YSRC2, \YDST, \YDST    # YDST = {YDST, YS2} >> RVAL*8
.endm
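
# Worked example (added for readability, not in the original source): with
# Y_3 = {w15,w14,w13,w12} and Y_2 = {w11,w10,w9,w8} (high qword first),
# "MY_VPALIGNR YTMP0, Y_3, Y_2, 8" yields YTMP0 = {w12,w11,w10,w9}, i.e.
# w[t-7] for t = 16..19.  The vperm2f128 first builds {Y_3_lo, Y_2_hi} so
# that the per-lane vpalignr byte shift can pull data across the 128-bit
# lane boundary, which a plain vpalignr cannot do.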
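# The scalar rounds below implement the standard SHA-512 round of
# FIPS 180-4 (a reference restatement, added here for readability):
#     S1  = (e ror 14) ^ (e ror 18) ^ (e ror 41)
#     S0  = (a ror 28) ^ (a ror 34) ^ (a ror 39)
#     CH  = (e & f) ^ (~e & g)          # computed below as ((f^g)&e)^g
#     MAJ = (a&b) ^ (a&c) ^ (b&c)       # computed below as ((a|c)&b)|(a&c)
#     t1  = h + S1 + CH + K[t] + W[t]
#     h'  = t1 + S0 + MAJ,  d' = d + t1
# rorx (BMI2) rotates without touching flags, so the adds and logic ops
# can interleave freely with the rotates.
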
.macro FOUR_ROUNDS_AND_SCHED
################################### RND N + 0 #########################################

        # Extract w[t-7]
        MY_VPALIGNR     YTMP0, Y_3, Y_2, 8              # YTMP0 = W[-7]
        # Calculate w[t-16] + w[t-7]
        vpaddq          Y_0, YTMP0, YTMP0               # YTMP0 = W[-7] + W[-16]
        # Extract w[t-15]
        MY_VPALIGNR     YTMP1, Y_1, Y_0, 8              # YTMP1 = W[-15]

        # Calculate sigma0

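        # AVX2 has no 64-bit vector rotate, so each "ror n" below is
        # synthesized from a shift pair: x ror n = (x >> n) | (x << (64-n)).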
        # Calculate w[t-15] ror 1
        vpsrlq          $1, YTMP1, YTMP2
        vpsllq          $(64-1), YTMP1, YTMP3
        vpor            YTMP2, YTMP3, YTMP3             # YTMP3 = W[-15] ror 1
        # Calculate w[t-15] shr 7
        vpsrlq          $7, YTMP1, YTMP4                # YTMP4 = W[-15] >> 7

        mov     a, y3           # y3 = a
        rorx    $41, e, y0      # y0 = e >> 41
        rorx    $18, e, y1      # y1 = e >> 18
        add     frame_XFER(%rsp), h             # h = k + w + h
        or      c, y3           # y3 = a|c
        mov     f, y2           # y2 = f
        rorx    $34, a, T1      # T1 = a >> 34

        xor     y1, y0          # y0 = (e>>41) ^ (e>>18)
        xor     g, y2           # y2 = f^g
        rorx    $14, e, y1      # y1 = (e >> 14)

        and     e, y2           # y2 = (f^g)&e
        xor     y1, y0          # y0 = (e>>41) ^ (e>>18) ^ (e>>14)
        rorx    $39, a, y1      # y1 = a >> 39
        add     h, d            # d = k + w + h + d

        and     b, y3           # y3 = (a|c)&b
        xor     T1, y1          # y1 = (a>>39) ^ (a>>34)
        rorx    $28, a, T1      # T1 = (a >> 28)

        xor     g, y2           # y2 = CH = ((f^g)&e)^g
        xor     T1, y1          # y1 = (a>>39) ^ (a>>34) ^ (a>>28)
        mov     a, T1           # T1 = a
        and     c, T1           # T1 = a&c

        add     y0, y2          # y2 = S1 + CH
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)
        add     y1, h           # h = k + w + h + S0

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1

        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0
        add     y3, h           # h = t1 + S0 + MAJ

        RotateState

################################### RND N + 1 #########################################

        # Calculate w[t-15] ror 8
        vpsrlq          $8, YTMP1, YTMP2
        vpsllq          $(64-8), YTMP1, YTMP1
        vpor            YTMP2, YTMP1, YTMP1             # YTMP1 = W[-15] ror 8
        # XOR the three components
        vpxor           YTMP4, YTMP3, YTMP3             # YTMP3 = (W[-15] ror 1) ^ (W[-15] >> 7)
        vpxor           YTMP1, YTMP3, YTMP1             # YTMP1 = s0


        # Add three components, w[t-16], w[t-7] and sigma0
        vpaddq          YTMP1, YTMP0, YTMP0             # YTMP0 = W[-16] + W[-7] + s0
        # Move to appropriate lanes for calculating w[16] and w[17]
        vperm2f128      $0x0, YTMP0, YTMP0, Y_0         # Y_0 = W[-16] + W[-7] + s0 {BABA}
        # Move to appropriate lanes for calculating w[18] and w[19]
        vpand           MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}

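        # Note: the four new schedule words are computed in two halves
        # because sigma1 feeds on w[t-2]: for w[16],w[17] that input
        # (w[14],w[15]) already exists, but for w[18],w[19] it is
        # w[16],w[17] themselves, which must be produced first.  Hence
        # the {BABA}/{DC00} lane dance above.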
        # Calculate w[16] and w[17] in both 128 bit lanes

        # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
        vperm2f128      $0x11, Y_3, Y_3, YTMP2          # YTMP2 = W[-2] {BABA}
        vpsrlq          $6, YTMP2, YTMP4                # YTMP4 = W[-2] >> 6 {BABA}


        mov     a, y3           # y3 = a
        rorx    $41, e, y0      # y0 = e >> 41
        rorx    $18, e, y1      # y1 = e >> 18
        add     1*8+frame_XFER(%rsp), h         # h = k + w + h
        or      c, y3           # y3 = a|c


        mov     f, y2           # y2 = f
        rorx    $34, a, T1      # T1 = a >> 34
        xor     y1, y0          # y0 = (e>>41) ^ (e>>18)
        xor     g, y2           # y2 = f^g


        rorx    $14, e, y1      # y1 = (e >> 14)
        xor     y1, y0          # y0 = (e>>41) ^ (e>>18) ^ (e>>14)
        rorx    $39, a, y1      # y1 = a >> 39
        and     e, y2           # y2 = (f^g)&e
        add     h, d            # d = k + w + h + d

        and     b, y3           # y3 = (a|c)&b
        xor     T1, y1          # y1 = (a>>39) ^ (a>>34)

        rorx    $28, a, T1      # T1 = (a >> 28)
        xor     g, y2           # y2 = CH = ((f^g)&e)^g

        xor     T1, y1          # y1 = (a>>39) ^ (a>>34) ^ (a>>28)
        mov     a, T1           # T1 = a
        and     c, T1           # T1 = a&c
        add     y0, y2          # y2 = S1 + CH

        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)
        add     y1, h           # h = k + w + h + S0

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0
        add     y3, h           # h = t1 + S0 + MAJ

        RotateState


################################### RND N + 2 #########################################

        vpsrlq          $19, YTMP2, YTMP3               # YTMP3 = W[-2] >> 19 {BABA}
        vpsllq          $(64-19), YTMP2, YTMP1          # YTMP1 = W[-2] << 19 {BABA}
        vpor            YTMP1, YTMP3, YTMP3             # YTMP3 = W[-2] ror 19 {BABA}
        vpxor           YTMP3, YTMP4, YTMP4             # YTMP4 = (W[-2] ror 19) ^ (W[-2] >> 6) {BABA}
        vpsrlq          $61, YTMP2, YTMP3               # YTMP3 = W[-2] >> 61 {BABA}
        vpsllq          $(64-61), YTMP2, YTMP1          # YTMP1 = W[-2] << 61 {BABA}
        vpor            YTMP1, YTMP3, YTMP3             # YTMP3 = W[-2] ror 61 {BABA}
        vpxor           YTMP3, YTMP4, YTMP4             # YTMP4 = s1 {BABA}


        # Add sigma1 to the other components to get w[16] and w[17]
        vpaddq          YTMP4, Y_0, Y_0                 # Y_0 = {W[1], W[0], W[1], W[0]}

        # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
        vpsrlq          $6, Y_0, YTMP4                  # YTMP4 = W[-2] >> 6 {DC--}

        mov     a, y3           # y3 = a
        rorx    $41, e, y0      # y0 = e >> 41
        add     2*8+frame_XFER(%rsp), h         # h = k + w + h

        rorx    $18, e, y1      # y1 = e >> 18
        or      c, y3           # y3 = a|c
        mov     f, y2           # y2 = f
        xor     g, y2           # y2 = f^g

        rorx    $34, a, T1      # T1 = a >> 34
        xor     y1, y0          # y0 = (e>>41) ^ (e>>18)
        and     e, y2           # y2 = (f^g)&e

        rorx    $14, e, y1      # y1 = (e >> 14)
        add     h, d            # d = k + w + h + d
        and     b, y3           # y3 = (a|c)&b

        xor     y1, y0          # y0 = (e>>41) ^ (e>>18) ^ (e>>14)
        rorx    $39, a, y1      # y1 = a >> 39
        xor     g, y2           # y2 = CH = ((f^g)&e)^g

        xor     T1, y1          # y1 = (a>>39) ^ (a>>34)
        rorx    $28, a, T1      # T1 = (a >> 28)

        xor     T1, y1          # y1 = (a>>39) ^ (a>>34) ^ (a>>28)
        mov     a, T1           # T1 = a
        and     c, T1           # T1 = a&c
        add     y0, y2          # y2 = S1 + CH

        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)
        add     y1, h           # h = k + w + h + S0
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0

        add     y3, h           # h = t1 + S0 + MAJ

        RotateState

################################### RND N + 3 #########################################

        vpsrlq          $19, Y_0, YTMP3                 # YTMP3 = W[-2] >> 19 {DC--}
        vpsllq          $(64-19), Y_0, YTMP1            # YTMP1 = W[-2] << 19 {DC--}
        vpor            YTMP1, YTMP3, YTMP3             # YTMP3 = W[-2] ror 19 {DC--}
        vpxor           YTMP3, YTMP4, YTMP4             # YTMP4 = (W[-2] ror 19) ^ (W[-2] >> 6) {DC--}
        vpsrlq          $61, Y_0, YTMP3                 # YTMP3 = W[-2] >> 61 {DC--}
        vpsllq          $(64-61), Y_0, YTMP1            # YTMP1 = W[-2] << 61 {DC--}
        vpor            YTMP1, YTMP3, YTMP3             # YTMP3 = W[-2] ror 61 {DC--}
        vpxor           YTMP3, YTMP4, YTMP4             # YTMP4 = s1 {DC--}


        # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
        # to newly calculated sigma1 to get w[18] and w[19]
        vpaddq          YTMP4, YTMP0, YTMP2             # YTMP2 = {W[3], W[2], --, --}

        # Form w[19], w[18], w[17], w[16]
        vpblendd        $0xF0, YTMP2, Y_0, Y_0          # Y_0 = {W[3], W[2], W[1], W[0]}

        mov     a, y3           # y3 = a
        rorx    $41, e, y0      # y0 = e >> 41
        rorx    $18, e, y1      # y1 = e >> 18
        add     3*8+frame_XFER(%rsp), h         # h = k + w + h
        or      c, y3           # y3 = a|c


        mov     f, y2           # y2 = f
        rorx    $34, a, T1      # T1 = a >> 34
        xor     y1, y0          # y0 = (e>>41) ^ (e>>18)
        xor     g, y2           # y2 = f^g


        rorx    $14, e, y1      # y1 = (e >> 14)
        and     e, y2           # y2 = (f^g)&e
        add     h, d            # d = k + w + h + d
        and     b, y3           # y3 = (a|c)&b

        xor     y1, y0          # y0 = (e>>41) ^ (e>>18) ^ (e>>14)
        xor     g, y2           # y2 = CH = ((f^g)&e)^g

        rorx    $39, a, y1      # y1 = a >> 39
        add     y0, y2          # y2 = S1 + CH

        xor     T1, y1          # y1 = (a>>39) ^ (a>>34)
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1

        rorx    $28, a, T1      # T1 = (a >> 28)

        xor     T1, y1          # y1 = (a>>39) ^ (a>>34) ^ (a>>28)
        mov     a, T1           # T1 = a
        and     c, T1           # T1 = a&c
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)

        add     y1, h           # h = k + w + h + S0
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0
        add     y3, h           # h = t1 + S0 + MAJ

        RotateState

        rotate_Ys
.endm
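# DO_4ROUNDS performs four rounds without message scheduling; it is used
# for the final 16 rounds (.Lloop2 below), whose W values were already
# produced by the FOUR_ROUNDS_AND_SCHED iterations.
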
.macro DO_4ROUNDS

################################### RND N + 0 #########################################

        mov     f, y2           # y2 = f
        rorx    $41, e, y0      # y0 = e >> 41
        rorx    $18, e, y1      # y1 = e >> 18
        xor     g, y2           # y2 = f^g

        xor     y1, y0          # y0 = (e>>41) ^ (e>>18)
        rorx    $14, e, y1      # y1 = (e >> 14)
        and     e, y2           # y2 = (f^g)&e

        xor     y1, y0          # y0 = (e>>41) ^ (e>>18) ^ (e>>14)
        rorx    $34, a, T1      # T1 = a >> 34
        xor     g, y2           # y2 = CH = ((f^g)&e)^g
        rorx    $39, a, y1      # y1 = a >> 39
        mov     a, y3           # y3 = a

        xor     T1, y1          # y1 = (a>>39) ^ (a>>34)
        rorx    $28, a, T1      # T1 = (a >> 28)
        add     frame_XFER(%rsp), h             # h = k + w + h
        or      c, y3           # y3 = a|c

        xor     T1, y1          # y1 = (a>>39) ^ (a>>34) ^ (a>>28)
        mov     a, T1           # T1 = a
        and     b, y3           # y3 = (a|c)&b
        and     c, T1           # T1 = a&c
        add     y0, y2          # y2 = S1 + CH

        add     h, d            # d = k + w + h + d
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)
        add     y1, h           # h = k + w + h + S0

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1

        RotateState

################################### RND N + 1 #########################################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0
        mov     f, y2           # y2 = f
        rorx    $41, e, y0      # y0 = e >> 41
        rorx    $18, e, y1      # y1 = e >> 18
        xor     g, y2           # y2 = f^g

        xor     y1, y0          # y0 = (e>>41) ^ (e>>18)
        rorx    $14, e, y1      # y1 = (e >> 14)
        and     e, y2           # y2 = (f^g)&e
        add     y3, old_h       # h = t1 + S0 + MAJ

        xor     y1, y0          # y0 = (e>>41) ^ (e>>18) ^ (e>>14)
        rorx    $34, a, T1      # T1 = a >> 34
        xor     g, y2           # y2 = CH = ((f^g)&e)^g
        rorx    $39, a, y1      # y1 = a >> 39
        mov     a, y3           # y3 = a

        xor     T1, y1          # y1 = (a>>39) ^ (a>>34)
        rorx    $28, a, T1      # T1 = (a >> 28)
        add     8*1+frame_XFER(%rsp), h         # h = k + w + h
        or      c, y3           # y3 = a|c

        xor     T1, y1          # y1 = (a>>39) ^ (a>>34) ^ (a>>28)
        mov     a, T1           # T1 = a
        and     b, y3           # y3 = (a|c)&b
        and     c, T1           # T1 = a&c
        add     y0, y2          # y2 = S1 + CH

        add     h, d            # d = k + w + h + d
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)
        add     y1, h           # h = k + w + h + S0

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1

        RotateState

################################### RND N + 2 #########################################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0
        mov     f, y2           # y2 = f
        rorx    $41, e, y0      # y0 = e >> 41
        rorx    $18, e, y1      # y1 = e >> 18
        xor     g, y2           # y2 = f^g

        xor     y1, y0          # y0 = (e>>41) ^ (e>>18)
        rorx    $14, e, y1      # y1 = (e >> 14)
        and     e, y2           # y2 = (f^g)&e
        add     y3, old_h       # h = t1 + S0 + MAJ

        xor     y1, y0          # y0 = (e>>41) ^ (e>>18) ^ (e>>14)
        rorx    $34, a, T1      # T1 = a >> 34
        xor     g, y2           # y2 = CH = ((f^g)&e)^g
        rorx    $39, a, y1      # y1 = a >> 39
        mov     a, y3           # y3 = a

        xor     T1, y1          # y1 = (a>>39) ^ (a>>34)
        rorx    $28, a, T1      # T1 = (a >> 28)
        add     8*2+frame_XFER(%rsp), h         # h = k + w + h
        or      c, y3           # y3 = a|c

        xor     T1, y1          # y1 = (a>>39) ^ (a>>34) ^ (a>>28)
        mov     a, T1           # T1 = a
        and     b, y3           # y3 = (a|c)&b
        and     c, T1           # T1 = a&c
        add     y0, y2          # y2 = S1 + CH

        add     h, d            # d = k + w + h + d
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)
        add     y1, h           # h = k + w + h + S0

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1

        RotateState

################################### RND N + 3 #########################################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0
        mov     f, y2           # y2 = f
        rorx    $41, e, y0      # y0 = e >> 41
        rorx    $18, e, y1      # y1 = e >> 18
        xor     g, y2           # y2 = f^g

        xor     y1, y0          # y0 = (e>>41) ^ (e>>18)
        rorx    $14, e, y1      # y1 = (e >> 14)
        and     e, y2           # y2 = (f^g)&e
        add     y3, old_h       # h = t1 + S0 + MAJ

        xor     y1, y0          # y0 = (e>>41) ^ (e>>18) ^ (e>>14)
        rorx    $34, a, T1      # T1 = a >> 34
        xor     g, y2           # y2 = CH = ((f^g)&e)^g
        rorx    $39, a, y1      # y1 = a >> 39
        mov     a, y3           # y3 = a

        xor     T1, y1          # y1 = (a>>39) ^ (a>>34)
        rorx    $28, a, T1      # T1 = (a >> 28)
        add     8*3+frame_XFER(%rsp), h         # h = k + w + h
        or      c, y3           # y3 = a|c

        xor     T1, y1          # y1 = (a>>39) ^ (a>>34) ^ (a>>28)
        mov     a, T1           # T1 = a
        and     b, y3           # y3 = (a|c)&b
        and     c, T1           # T1 = a&c
        add     y0, y2          # y2 = S1 + CH


        add     h, d            # d = k + w + h + d
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)
        add     y1, h           # h = k + w + h + S0

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1

        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0

        add     y3, h           # h = t1 + S0 + MAJ

        RotateState

.endm

########################################################################
# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
# Purpose: Updates the SHA512 digest stored at "state" with the message
# stored in "data".
# The size of the message pointed to by "data" must be an integer
# multiple of SHA512 message blocks.
# "blocks" is the message length in SHA512 blocks
########################################################################
SYM_TYPED_FUNC_START(sha512_transform_rorx)
        # Save GPRs
        push    %rbx
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        # Allocate Stack Space
        push    %rbp
        mov     %rsp, %rbp
        sub     $frame_size, %rsp
        and     $~(0x20 - 1), %rsp
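        # (keeps %rsp 32-byte aligned: frame_XFER is written with vmovdqa
        # below, which faults on unaligned addresses)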

        shl     $7, NUM_BLKS    # convert to bytes
        jz      .Ldone_hash
        add     INP, NUM_BLKS   # pointer to end of data
        mov     NUM_BLKS, frame_INPEND(%rsp)

        ## load initial digest
        mov     8*0(CTX1), a
        mov     8*1(CTX1), b
        mov     8*2(CTX1), c
        mov     8*3(CTX1), d
        mov     8*4(CTX1), e
        mov     8*5(CTX1), f
        mov     8*6(CTX1), g
        mov     8*7(CTX1), h

        # save %rdi (CTX) before it gets clobbered
        mov     %rdi, frame_CTX(%rsp)

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK

.Lloop0:
        lea     K512(%rip), TBL

        ## byte swap first 16 qwords
        COPY_YMM_AND_BSWAP      Y_0, (INP), BYTE_FLIP_MASK
        COPY_YMM_AND_BSWAP      Y_1, 1*32(INP), BYTE_FLIP_MASK
        COPY_YMM_AND_BSWAP      Y_2, 2*32(INP), BYTE_FLIP_MASK
        COPY_YMM_AND_BSWAP      Y_3, 3*32(INP), BYTE_FLIP_MASK

        mov     INP, frame_INP(%rsp)

        ## schedule 64 input qwords, by doing 4 iterations of 16 rounds
        movq    $4, frame_SRND(%rsp)

.align 16
.Lloop1:
        vpaddq  (TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        vpaddq  1*32(TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        vpaddq  2*32(TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        vpaddq  3*32(TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        add     $(4*32), TBL
        FOUR_ROUNDS_AND_SCHED

        subq    $1, frame_SRND(%rsp)
        jne     .Lloop1

        movq    $2, frame_SRND(%rsp)
.Lloop2:
        vpaddq  (TBL), Y_0, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        DO_4ROUNDS
        vpaddq  1*32(TBL), Y_1, XFER
        vmovdqa XFER, frame_XFER(%rsp)
        add     $(2*32), TBL
        DO_4ROUNDS

        vmovdqa Y_2, Y_0
        vmovdqa Y_3, Y_1

        subq    $1, frame_SRND(%rsp)
        jne     .Lloop2

        mov     frame_CTX(%rsp), CTX2
        addm    8*0(CTX2), a
        addm    8*1(CTX2), b
        addm    8*2(CTX2), c
        addm    8*3(CTX2), d
        addm    8*4(CTX2), e
        addm    8*5(CTX2), f
        addm    8*6(CTX2), g
        addm    8*7(CTX2), h

        mov     frame_INP(%rsp), INP
        add     $128, INP
        cmp     frame_INPEND(%rsp), INP
        jne     .Lloop0

.Ldone_hash:

        # Restore Stack Pointer
        mov     %rbp, %rsp
        pop     %rbp

        # Restore GPRs
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbx

        vzeroupper
        RET
SYM_FUNC_END(sha512_transform_rorx)

########################################################################
### Binary Data


# Mergeable 640-byte rodata section. This allows linker to merge the section
# with other, exactly the same 640-byte fragment of another rodata section
# (if such section exists).
.section        .rodata.cst640.K512, "aM", @progbits, 640
.align 64
# K[t] used in SHA512 hashing
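# (the first 64 bits of the fractional parts of the cube roots of the
# first eighty primes, per FIPS 180-4)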
K512:
        .quad   0x428a2f98d728ae22,0x7137449123ef65cd
        .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
        .quad   0x3956c25bf348b538,0x59f111f1b605d019
        .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
        .quad   0xd807aa98a3030242,0x12835b0145706fbe
        .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
        .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
        .quad   0x9bdc06a725c71235,0xc19bf174cf692694
        .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
        .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
        .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
        .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
        .quad   0x983e5152ee66dfab,0xa831c66d2db43210
        .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
        .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
        .quad   0x06ca6351e003826f,0x142929670a0e6e70
        .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
        .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
        .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
        .quad   0x81c2c92e47edaee6,0x92722c851482353b
        .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
        .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
        .quad   0xd192e819d6ef5218,0xd69906245565a910
        .quad   0xf40e35855771202a,0x106aa07032bbd1b8
        .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
        .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
        .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
        .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
        .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
        .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
        .quad   0x90befffa23631e28,0xa4506cebde82bde9
        .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
        .quad   0xca273eceea26619c,0xd186b8c721c0c207
        .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
        .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
        .quad   0x113f9804bef90dae,0x1b710b35131c471b
        .quad   0x28db77f523047d84,0x32caab7b40c72493
        .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
        .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
        .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817

.section        .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
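# e.g. memory bytes 00 01 .. 07 load as the little-endian qword
# 0x0706050403020100 and are shuffled to 0x0001020304050607, i.e. the
# big-endian message word value.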
PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x08090a0b0c0d0e0f0001020304050607
        .octa 0x18191a1b1c1d1e1f1011121314151617

.section        .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
.align 32
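# vpand with this mask clears the low 128-bit lane and keeps the high one;
# used in FOUR_ROUNDS_AND_SCHED to isolate the {DC00} half of the
# message-schedule sum.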
MASK_YMM_LO:
        .octa 0x00000000000000000000000000000000
        .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF