Linux/arch/x86/crypto/sha1_ssse3_asm.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

#define CTX     %rdi    // arg1
#define BUF     %rsi    // arg2
#define CNT     %rdx    // arg3

#define REG_A   %ecx
#define REG_B   %esi
#define REG_C   %edi
#define REG_D   %r12d
#define REG_E   %edx

#define REG_T1  %eax
#define REG_T2  %ebx

#define K_BASE          %r8
#define HASH_PTR        %r9
#define BUFFER_PTR      %r10
#define BUFFER_END      %r11

#define W_TMP1  %xmm0
#define W_TMP2  %xmm9

#define W0      %xmm1
#define W4      %xmm2
#define W8      %xmm3
#define W12     %xmm4
#define W16     %xmm5
#define W20     %xmm6
#define W24     %xmm7
#define W28     %xmm8

#define XMM_SHUFB_BSWAP %xmm10

/* we keep a window of 16 pre-calculated w[i]+K values (64 bytes) in a circular buffer */
#define WK(t)   (((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD 16

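/*
 * For reference only (not part of the generated code): the 64-byte stack
 * workspace holds 16 pre-computed w[i]+K dwords, indexed modulo 16 by
 * WK(t), and the pre-calculation runs W_PRECALC_AHEAD rounds ahead of the
 * rounds that consume the values.  A plain-C sketch of the idea, with
 * illustrative names only:
 *
 *   u32 wk[16];                          // the 64-byte workspace
 *   #define WK_SLOT(t) ((t) & 15)        // WK(t) above, in array terms
 *
 *   // while round t executes:
 *   //   wk[WK_SLOT(t + W_PRECALC_AHEAD)] = w(t + 16) + K(t + 16);
 *   //   e += wk[WK_SLOT(t)];
 */
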
/*
 * This macro implements the SHA-1 function's body for one or more 64-byte
 * blocks.
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
        SYM_TYPED_FUNC_START(\name)

        push    %rbx
        push    %r12
        push    %rbp
        mov     %rsp, %rbp

        sub     $64, %rsp               # allocate workspace
        and     $~15, %rsp              # align stack

        mov     CTX, HASH_PTR
        mov     BUF, BUFFER_PTR

        shl     $6, CNT                 # multiply by 64
        add     BUF, CNT
        mov     CNT, BUFFER_END

        lea     K_XMM_AR(%rip), K_BASE
        xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        # cleanup workspace (zero the 64-byte w[i]+K buffer, 8 quadwords)
        mov     $8, %ecx
        mov     %rsp, %rdi
        xor     %eax, %eax
        rep stosq

        mov     %rbp, %rsp              # deallocate workspace
        pop     %rbp
        pop     %r12
        pop     %rbx
        RET

        SYM_FUNC_END(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 for each 64-byte block,
 * looping until all blocks are processed
 */
.macro SHA1_PIPELINED_MAIN_BODY
        INIT_REGALLOC

        mov       (HASH_PTR), A
        mov      4(HASH_PTR), B
        mov      8(HASH_PTR), C
        mov     12(HASH_PTR), D
        mov     16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
        W_PRECALC i
    .set i, (i+1)
  .endr

.align 4
1:
        RR F1,A,B,C,D,E,0
        RR F1,D,E,A,B,C,2
        RR F1,B,C,D,E,A,4
        RR F1,E,A,B,C,D,6
        RR F1,C,D,E,A,B,8

        RR F1,A,B,C,D,E,10
        RR F1,D,E,A,B,C,12
        RR F1,B,C,D,E,A,14
        RR F1,E,A,B,C,D,16
        RR F1,C,D,E,A,B,18

        RR F2,A,B,C,D,E,20
        RR F2,D,E,A,B,C,22
        RR F2,B,C,D,E,A,24
        RR F2,E,A,B,C,D,26
        RR F2,C,D,E,A,B,28

        RR F2,A,B,C,D,E,30
        RR F2,D,E,A,B,C,32
        RR F2,B,C,D,E,A,34
        RR F2,E,A,B,C,D,36
        RR F2,C,D,E,A,B,38

        RR F3,A,B,C,D,E,40
        RR F3,D,E,A,B,C,42
        RR F3,B,C,D,E,A,44
        RR F3,E,A,B,C,D,46
        RR F3,C,D,E,A,B,48

        RR F3,A,B,C,D,E,50
        RR F3,D,E,A,B,C,52
        RR F3,B,C,D,E,A,54
        RR F3,E,A,B,C,D,56
        RR F3,C,D,E,A,B,58

        add     $64, BUFFER_PTR         # move to the next 64-byte block
        cmp     BUFFER_END, BUFFER_PTR  # if the current is the last one use
        cmovae  K_BASE, BUFFER_PTR      # dummy source to avoid buffer overrun

        RR F4,A,B,C,D,E,60
        RR F4,D,E,A,B,C,62
        RR F4,B,C,D,E,A,64
        RR F4,E,A,B,C,D,66
        RR F4,C,D,E,A,B,68

        RR F4,A,B,C,D,E,70
        RR F4,D,E,A,B,C,72
        RR F4,B,C,D,E,A,74
        RR F4,E,A,B,C,D,76
        RR F4,C,D,E,A,B,78

        UPDATE_HASH   (HASH_PTR), A
        UPDATE_HASH  4(HASH_PTR), B
        UPDATE_HASH  8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        RESTORE_RENAMED_REGS
        cmp     K_BASE, BUFFER_PTR      # K_BASE means we reached the end
        jne     1b
.endm
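
/*
 * For reference: a plain-C sketch of the block loop above (illustrative
 * only, names are not part of the code).  The cmovae parks BUFFER_PTR at
 * K_BASE after the last block, so the W pre-calculation issued during the
 * final rounds reads from the constant table instead of running past the
 * end of the input:
 *
 *   const u8 *buf = data, *end = data + blocks * 64, *dummy = k_xmm_ar;
 *   do {
 *           // rounds 0..59 on the current block
 *           buf += 64;
 *           if (buf >= end)
 *                   buf = dummy;            // park at the constant table
 *           // rounds 60..79; their W pre-calc already reads the next
 *           // block (or the dummy source) through buf
 *           // UPDATE_HASH: fold a..e back into the five hash words
 *   } while (buf != dummy);
 */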

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
        # order is important (REG_C is where it should be)
        mov     B, REG_B
        mov     D, REG_D
        mov     A, REG_A
        mov     E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm
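
/*
 * SWAP_REG_NAMES exchanges two of the symbolic names set by INIT_REGALLOC
 * at assembly time; it emits no instruction.  In the F macros below,
 * "mov \c, T1" copies c into the scratch register and the names are then
 * swapped: the copy keeps the name \c (so c's value stays available),
 * while c's old register takes the name T1 and is clobbered to compute
 * F(b, c, d), saving a restore move per round.  Illustrative example for
 * the very first round (mapping per INIT_REGALLOC):
 *
 *   before:  C -> %edi, T1 -> %eax;  "mov \c, T1" is mov %edi, %eax
 *   after :  C -> %eax (still holds c), T1 -> %edi (scratch)
 */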

.macro F1  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES \c, T1
        xor     \d, T1
        and     \b, T1
        xor     \d, T1
.endm

.macro F2  b, c, d
        mov     \d, T1
        SWAP_REG_NAMES \d, T1
        xor     \c, T1
        xor     \b, T1
.endm

.macro F3  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES \c, T1
        mov     \b, T2
        or      \b, T1
        and     \c, T2
        and     \d, T1
        or      T2, T1
.endm

.macro F4  b, c, d
        F2 \b, \c, \d
.endm
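
/*
 * For reference, the round functions computed by F1..F4 above, in plain C
 * (illustrative, not part of the build):
 *
 *   static inline u32 f1(u32 b, u32 c, u32 d)    // rounds  0..19, "choose"
 *   {
 *           return ((c ^ d) & b) ^ d;            // == (b & c) | (~b & d)
 *   }
 *
 *   static inline u32 f2(u32 b, u32 c, u32 d)    // rounds 20..39, parity
 *   {
 *           return b ^ c ^ d;
 *   }
 *
 *   static inline u32 f3(u32 b, u32 c, u32 d)    // rounds 40..59, majority
 *   {
 *           return ((b | c) & d) | (b & c);
 *   }
 *
 *   // rounds 60..79 reuse f2, which is why F4 simply expands to F2
 */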

.macro UPDATE_HASH  hash, val
        add     \hash, \val
        mov     \val, \hash
.endm
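
/*
 * i.e. the usual per-block feed-forward, in C terms (illustrative):
 * a += state[0]; state[0] = a; and likewise for b..e.  This also leaves
 * the working registers holding the new chaining values, ready for the
 * next block's first round without reloading them.
 */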

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR  F, a, b, c, d, e, round
        add     WK(\round), \e
        \F   \b, \c, \d         # t1 = F(b, c, d);
        W_PRECALC (\round + W_PRECALC_AHEAD)
        rol     $30, \b
        add     T1, \e
        add     WK(\round + 1), \d

        \F   \a, \b, \c
        W_PRECALC (\round + W_PRECALC_AHEAD + 1)
        rol     $5, \a
        add     \a, \e
        add     T1, \d
        ror     $7, \a          # ((a <<r 5) >>r 7) => (a <<r 30)

        mov     \e, T1
        SWAP_REG_NAMES \e, T1

        rol     $5, T1
        add     T1, \d

        # write:  \a, \b
        # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm
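
/*
 * For reference, one RR invocation is equivalent to the following two
 * SHA-1 rounds in plain C (illustrative; rol32() as in <linux/bitops.h>,
 * wk[] is the pre-computed w+K workspace):
 *
 *   e += wk[i]     + f(b, c, d) + rol32(a, 5);  b = rol32(b, 30);
 *   d += wk[i + 1] + f(a, b, c) + rol32(e, 5);  a = rol32(a, 30);
 *
 * where the second line already sees the updated b and e.  The caller then
 * rotates the register roles as noted in the comment above: the next RR's
 * (a, b, c, d, e) are this one's (d, e, a, b, c).
 */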

.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)     # pre-compute for the next iteration
    .if (i == 0)
        W_PRECALC_RESET
    .endif
        W_PRECALC_00_15
  .elseif (i<32)
        W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
        W_PRECALC_32_79
  .endif
.endm
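
/*
 * In other words (illustrative): for a round index r, W_PRECALC selects
 * the byte offset of the proper round constant within K_XMM_AR below,
 *
 *   int k_xmm = (i / 20) * 16;    // 0, 16, 32 or 48 -> K1, K2, K3, K4
 *
 * and dispatches to the 00_15, 16_31 or 32_79 scheduling code.  Indices
 * 80 .. 80+W_PRECALC_AHEAD-1 wrap back to 0..15, so the first 16 w[i]+K
 * values of the *next* block are computed while the last rounds of the
 * current block are still executing.
 */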

.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
        W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
        W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
        W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
        movdqu  (i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
        pshufb  XMM_SHUFB_BSWAP, W_TMP1
        movdqa  W_TMP1, W
  .elseif ((i & 3) == 2)
        paddd   (K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm
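
/*
 * Scalar equivalent of the above, for reference (illustrative only).  One
 * vector iteration covers four message words, spread over four rounds:
 *
 *   for (j = 0; j < 4; j++) {
 *           w[i + j]  = get_unaligned_be32(data + (i + j) * 4);
 *           wk[i + j] = w[i + j] + 0x5a827999;   // K1
 *   }
 */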

/* message scheduling pre-compute for rounds 16-31
 *
 * - the last 32 w[i] values are kept in 8 XMM registers
 * - the K+w[i] values are pre-calculated and stored to memory, to be loaded
 *   later by the scalar ALU add instructions
 *
 * the w[i]->w[i-3] dependency makes the vectorization for rounds 16-31 some
 * "heavy lifting"; it gets easier for rounds 32-79
 */
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
        movdqa  W_minus_12, W
        palignr $8, W_minus_16, W       # w[i-14]
        movdqa  W_minus_04, W_TMP1
        psrldq  $4, W_TMP1              # w[i-3]
        pxor    W_minus_08, W
  .elseif ((i & 3) == 1)
        pxor    W_minus_16, W_TMP1
        pxor    W_TMP1, W
        movdqa  W, W_TMP2
        movdqa  W, W_TMP1
        pslldq  $12, W_TMP2
  .elseif ((i & 3) == 2)
        psrld   $31, W
        pslld   $1, W_TMP1
        por     W, W_TMP1
        movdqa  W_TMP2, W
        psrld   $30, W_TMP2
        pslld   $2, W
  .elseif ((i & 3) == 3)
        pxor    W, W_TMP1
        pxor    W_TMP2, W_TMP1
        movdqa  W_TMP1, W
        paddd   K_XMM(K_BASE), W_TMP1
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm
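
/*
 * Scalar equivalent of the above, for reference (illustrative only):
 *
 *   w[i]  = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *   wk[i] = w[i] + K;
 *
 * Four w[i] are produced per vector iteration.  Since w[i-3] of the last
 * lane is itself one of the four values being computed, that lane is first
 * computed with 0 in place of w[i-3] and then patched up, which is what
 * the pslldq $12 / psrld $30 / pslld $2 sequence implements.
 */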

/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * here we use the equivalent:  w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization since the w[i]=>w[i-3] dependency
 * is broken
 */
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
        movdqa  W_minus_04, W_TMP1
        pxor    W_minus_28, W           # W is W_minus_32 before xor
        palignr $8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
        pxor    W_minus_16, W
        pxor    W_TMP1, W
        movdqa  W, W_TMP1
  .elseif ((i & 3) == 2)
        psrld   $30, W
        pslld   $2, W_TMP1
        por     W, W_TMP1
  .elseif ((i & 3) == 3)
        movdqa  W_TMP1, W
        paddd   K_XMM(K_BASE), W_TMP1
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm
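
/*
 * Scalar equivalent of the above, for reference (illustrative only):
 *
 *   w[i]  = rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2);
 *   wk[i] = w[i] + K;
 *
 * Unrolling the standard recurrence once gives this equivalent form; with
 * the nearest back-reference now at w[i-6], all four lanes of a vector can
 * be computed independently, so no in-vector fixup is needed here.
 */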

.endm           // W_PRECALC_SSSE3


#define K1      0x5a827999
#define K2      0x6ed9eba1
#define K3      0x8f1bbcdc
#define K4      0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
        .long K1, K1, K1, K1
        .long K2, K2, K2, K2
        .long K3, K3, K3, K3
        .long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f
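
/*
 * BSWAP_SHUFB_CTL is the pshufb/vpshufb control used above to byte-swap
 * each 32-bit lane, since SHA-1 message words are big-endian.  Stored
 * little-endian, its byte indices read 3,2,1,0, 7,6,5,4, ... so every
 * dword of the loaded block is reversed; per word this is simply
 * (illustrative C):
 *
 *   w[i] = swab32(((const u32 *)data)[i]);
 */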


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
        movdqu  \a,\b
.endm

/*
 * SSSE3 optimized implementation:
 *
 * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
 *                                      const u8 *data, int blocks);
 *
 * Note that struct sha1_state is assumed to begin with u32 state[5].
 */
SHA1_VECTOR_ASM     sha1_transform_ssse3
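
/*
 * A minimal usage sketch (illustrative): the C glue code in
 * arch/x86/crypto/sha1_ssse3_glue.c calls this with the SIMD unit claimed,
 * roughly:
 *
 *   kernel_fpu_begin();
 *   sha1_transform_ssse3(state, data, blocks);
 *   kernel_fpu_end();
 */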

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
        vmovdqu (i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
        vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
        vpaddd  (K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
        vmovdqa W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
        vpalignr $8, W_minus_16, W_minus_12, W  # w[i-14]
        vpsrldq $4, W_minus_04, W_TMP1          # w[i-3]
        vpxor   W_minus_08, W, W
        vpxor   W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
        vpxor   W_TMP1, W, W
        vpslldq $12, W, W_TMP2
        vpslld  $1, W, W_TMP1
  .elseif ((i & 3) == 2)
        vpsrld  $31, W, W
        vpor    W, W_TMP1, W_TMP1
        vpslld  $2, W_TMP2, W
        vpsrld  $30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
        vpxor   W, W_TMP1, W_TMP1
        vpxor   W_TMP2, W_TMP1, W
        vpaddd  K_XMM(K_BASE), W, W_TMP1
        vmovdqu W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
        vpalignr $8, W_minus_08, W_minus_04, W_TMP1
        vpxor   W_minus_28, W, W                # W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
        vpxor   W_minus_16, W_TMP1, W_TMP1
        vpxor   W_TMP1, W, W
  .elseif ((i & 3) == 2)
        vpslld  $2, W, W_TMP1
        vpsrld  $30, W, W
        vpor    W, W_TMP1, W
  .elseif ((i & 3) == 3)
        vpaddd  K_XMM(K_BASE), W, W_TMP1
        vmovdqu W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
        vmovdqu \a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(struct sha1_state *state,
 *                                     const u8 *data, int blocks);
 */
SHA1_VECTOR_ASM     sha1_transform_avx
