TOMOYO Linux Cross Reference
Linux/arch/x86/crypto/sha256-avx-asm.S

  1 ########################################################################
  2 # Implement fast SHA-256 with AVX1 instructions. (x86_64)
  3 #
  4 # Copyright (C) 2013 Intel Corporation.
  5 #
  6 # Authors:
  7 #     James Guilford <james.guilford@intel.com>
  8 #     Kirk Yap <kirk.s.yap@intel.com>
  9 #     Tim Chen <tim.c.chen@linux.intel.com>
 10 #
 11 # This software is available to you under a choice of one of two
 12 # licenses.  You may choose to be licensed under the terms of the GNU
 13 # General Public License (GPL) Version 2, available from the file
 14 # COPYING in the main directory of this source tree, or the
 15 # OpenIB.org BSD license below:
 16 #
 17 #     Redistribution and use in source and binary forms, with or
 18 #     without modification, are permitted provided that the following
 19 #     conditions are met:
 20 #
 21 #      - Redistributions of source code must retain the above
 22 #        copyright notice, this list of conditions and the following
 23 #        disclaimer.
 24 #
 25 #      - Redistributions in binary form must reproduce the above
 26 #        copyright notice, this list of conditions and the following
 27 #        disclaimer in the documentation and/or other materials
 28 #        provided with the distribution.
 29 #
 30 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 31 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 32 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 33 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 34 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 35 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 36 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 37 # SOFTWARE.
 38 ########################################################################
 39 #
 40 # This code is described in an Intel White-Paper:
 41 # "Fast SHA-256 Implementations on Intel Architecture Processors"
 42 #
 43 # To find it, surf to http://www.intel.com/p/en_US/embedded
 44 # and search for that title.
 45 #
 46 ########################################################################
 47 # This code schedules 1 block at a time, with 4 lanes per block
 48 ########################################################################
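    #
    # For reference (formulas per FIPS 180-4): each round implemented below
    # computes, in the scalar registers y0/y1/y2,
    #
    #     S1  = (e ror 6)  ^ (e ror 11) ^ (e ror 25)
    #     CH  = ((f ^ g) & e) ^ g
    #     S0  = (a ror 2)  ^ (a ror 13) ^ (a ror 22)
    #     MAJ = ((a | c) & b) | (a & c)
    #     h  += S1 + CH + K[t] + W[t];   d += h;   h += S0 + MAJ
    #
    # after which ROTATE_ARGS renames a..h for the next round.
    ########################################################################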
 49 
 50 #include <linux/linkage.h>
 51 #include <linux/cfi_types.h>
 52 
 53 ## assume buffers not aligned
 54 #define    VMOVDQ vmovdqu
 55 
 56 ################################ Define Macros
 57 
 58 # addm [mem], reg
 59 # Add reg to mem using reg-mem add and store
 60 .macro addm p1 p2
 61         add     \p1, \p2
 62         mov     \p2, \p1
 63 .endm
 64 
 65 
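    # MY_ROR n, reg
    # Rotate reg right by n bits.  With identical source and destination,
    # shld $(32-n), reg, reg rotates the 32-bit value left by (32-n),
    # which is equivalent to rotating it right by n.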
 66 .macro MY_ROR p1 p2
 67         shld    $(32-(\p1)), \p2, \p2
 68 .endm
 69 
 70 ################################
 71 
 72 # COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
 73 # Load xmm with mem and byte swap each dword
 74 .macro COPY_XMM_AND_BSWAP p1 p2 p3
 75         VMOVDQ \p2, \p1
 76         vpshufb \p3, \p1, \p1
 77 .endm
 78 
 79 ################################
 80 
 81 X0 = %xmm4
 82 X1 = %xmm5
 83 X2 = %xmm6
 84 X3 = %xmm7
 85 
 86 XTMP0 = %xmm0
 87 XTMP1 = %xmm1
 88 XTMP2 = %xmm2
 89 XTMP3 = %xmm3
 90 XTMP4 = %xmm8
 91 XFER = %xmm9
 92 XTMP5 = %xmm11
 93 
 94 SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
 95 SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
 96 BYTE_FLIP_MASK = %xmm13
 97 
 98 NUM_BLKS = %rdx   # 3rd arg
 99 INP = %rsi        # 2nd arg
100 CTX = %rdi        # 1st arg
101 
102 SRND = %rsi       # clobbers INP
103 c = %ecx
104 d = %r8d
105 e = %edx
106 TBL = %r12
107 a = %eax
108 b = %ebx
109 
110 f = %r9d
111 g = %r10d
112 h = %r11d
113 
114 y0 = %r13d
115 y1 = %r14d
116 y2 = %r15d
117 
118 
119 _INP_END_SIZE = 8
120 _INP_SIZE = 8
121 _XFER_SIZE = 16
122 _XMM_SAVE_SIZE = 0
123 
124 _INP_END = 0
125 _INP            = _INP_END  + _INP_END_SIZE
126 _XFER           = _INP      + _INP_SIZE
127 _XMM_SAVE       = _XFER     + _XFER_SIZE
128 STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
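    # Resulting scratch-area layout relative to the aligned %rsp:
    #   [%rsp +  0 ..  7]  _INP_END  pointer to the end of the input data
    #   [%rsp +  8 .. 15]  _INP      saved input pointer for the current block
    #   [%rsp + 16 .. 31]  _XFER     K[t..t+3] + W[t..t+3] for the next rounds
    # STACK_SIZE = 32 bytes; _XMM_SAVE is currently unused (size 0).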
129 
130 # rotate_Xs
131 # Rotate values of symbols X0...X3
132 .macro rotate_Xs
133 X_ = X0
134 X0 = X1
135 X1 = X2
136 X2 = X3
137 X3 = X_
138 .endm
139 
140 # ROTATE_ARGS
141 # Rotate values of symbols a...h
142 .macro ROTATE_ARGS
143 TMP_ = h
144 h = g
145 g = f
146 f = e
147 e = d
148 d = c
149 c = b
150 b = a
151 a = TMP_
152 .endm
153 
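    # FOUR_ROUNDS_AND_SCHED
    # Perform four rounds while scheduling the next four message words.
    # For reference, the interleaved schedule step is
    #     s0   = (W[t-15] ror 7)  ^ (W[t-15] ror 18) ^ (W[t-15] >> 3)
    #     s1   = (W[t-2]  ror 17) ^ (W[t-2]  ror 19) ^ (W[t-2]  >> 10)
    #     W[t] = W[t-16] + W[t-7] + s0 + s1
    # s0 is computed for all four lanes at once; s1 is computed two lanes
    # at a time ({xBxA}, then {xDxC}) and merged via SHUF_00BA/SHUF_DC00.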
154 .macro FOUR_ROUNDS_AND_SCHED
155         ## compute s0 four at a time and s1 two at a time
156         ## compute W[-16] + W[-7] 4 at a time
157 
158         mov     e, y0                   # y0 = e
159         MY_ROR  (25-11), y0             # y0 = e >> (25-11)
160         mov     a, y1                   # y1 = a
161         vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
162         MY_ROR  (22-13), y1             # y1 = a >> (22-13)
163         xor     e, y0                   # y0 = e ^ (e >> (25-11))
164         mov     f, y2                   # y2 = f
165         MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
 166         xor     a, y1                   # y1 = a ^ (a >> (22-13))
167         xor     g, y2                   # y2 = f^g
168         vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
169         xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
170         and     e, y2                   # y2 = (f^g)&e
171         MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
172         ## compute s0
173         vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
174         xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
 175         MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
176         xor     g, y2                   # y2 = CH = ((f^g)&e)^g
177         MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
178         add     y0, y2                  # y2 = S1 + CH
179         add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
180         mov     a, y0                   # y0 = a
181         add     y2, h                   # h = h + S1 + CH + k + w
182         mov     a, y2                   # y2 = a
183         vpsrld  $7, XTMP1, XTMP2
184         or      c, y0                   # y0 = a|c
185         add     h, d                    # d = d + h + S1 + CH + k + w
186         and     c, y2                   # y2 = a&c
187         vpslld  $(32-7), XTMP1, XTMP3
188         and     b, y0                   # y0 = (a|c)&b
189         add     y1, h                   # h = h + S1 + CH + k + w + S0
 190         vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7
 191         or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
192         add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
193         ROTATE_ARGS
194         mov     e, y0                   # y0 = e
195         mov     a, y1                   # y1 = a
196         MY_ROR  (25-11), y0             # y0 = e >> (25-11)
197         xor     e, y0                   # y0 = e ^ (e >> (25-11))
198         mov     f, y2                   # y2 = f
199         MY_ROR  (22-13), y1             # y1 = a >> (22-13)
200         vpsrld  $18, XTMP1, XTMP2       #
 201         xor     a, y1                   # y1 = a ^ (a >> (22-13))
202         MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
203         xor     g, y2                   # y2 = f^g
204         vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
205         MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
206         xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
207         and     e, y2                   # y2 = (f^g)&e
 208         MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
209         vpslld  $(32-18), XTMP1, XTMP1
210         xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
211         xor     g, y2                   # y2 = CH = ((f^g)&e)^g
212         vpxor   XTMP1, XTMP3, XTMP3     #
213         add     y0, y2                  # y2 = S1 + CH
214         add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
215         MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
 216         vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
217         mov     a, y0                   # y0 = a
218         add     y2, h                   # h = h + S1 + CH + k + w
219         mov     a, y2                   # y2 = a
220         vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
221         or      c, y0                   # y0 = a|c
222         add     h, d                    # d = d + h + S1 + CH + k + w
223         and     c, y2                   # y2 = a&c
224         ## compute low s1
225         vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
226         and     b, y0                   # y0 = (a|c)&b
227         add     y1, h                   # h = h + S1 + CH + k + w + S0
228         vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
 229         or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
230         add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
231         ROTATE_ARGS
232         mov     e, y0                   # y0 = e
233         mov     a, y1                   # y1 = a
234         MY_ROR  (25-11), y0             # y0 = e >> (25-11)
235         xor     e, y0                   # y0 = e ^ (e >> (25-11))
236         MY_ROR  (22-13), y1             # y1 = a >> (22-13)
237         mov     f, y2                   # y2 = f
 238         xor     a, y1                   # y1 = a ^ (a >> (22-13))
239         MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
240         vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}
241         xor     g, y2                   # y2 = f^g
242         vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xBxA}
243         xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
244         and     e, y2                   # y2 = (f^g)&e
245         vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xBxA}
246         MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
247         xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
248         xor     g, y2                   # y2 = CH = ((f^g)&e)^g
 249         MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
250         vpxor   XTMP3, XTMP2, XTMP2     #
251         add     y0, y2                  # y2 = S1 + CH
252         MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
253         add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
254         vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
255         mov     a, y0                   # y0 = a
256         add     y2, h                   # h = h + S1 + CH + k + w
257         mov     a, y2                   # y2 = a
258         vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
259         or      c, y0                   # y0 = a|c
260         add     h, d                    # d = d + h + S1 + CH + k + w
261         and     c, y2                   # y2 = a&c
262         vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}
263         and     b, y0                   # y0 = (a|c)&b
264         add     y1, h                   # h = h + S1 + CH + k + w + S0
265         ## compute high s1
266         vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
 267         or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
268         add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
269         ROTATE_ARGS
270         mov     e, y0                   # y0 = e
271         MY_ROR  (25-11), y0             # y0 = e >> (25-11)
272         mov     a, y1                   # y1 = a
273         MY_ROR  (22-13), y1             # y1 = a >> (22-13)
274         xor     e, y0                   # y0 = e ^ (e >> (25-11))
275         mov     f, y2                   # y2 = f
276         MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
277         vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
 278         xor     a, y1                   # y1 = a ^ (a >> (22-13))
279         xor     g, y2                   # y2 = f^g
280         vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xDxC}
281         xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
282         and     e, y2                   # y2 = (f^g)&e
283         MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
284         vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xDxC}
285         xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
 286         MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
287         xor     g, y2                   # y2 = CH = ((f^g)&e)^g
288         vpxor   XTMP3, XTMP2, XTMP2
289         MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
290         add     y0, y2                  # y2 = S1 + CH
291         add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
292         vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
293         mov     a, y0                   # y0 = a
294         add     y2, h                   # h = h + S1 + CH + k + w
295         mov     a, y2                   # y2 = a
296         vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
297         or      c, y0                   # y0 = a|c
298         add     h, d                    # d = d + h + S1 + CH + k + w
299         and     c, y2                   # y2 = a&c
300         vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
301         and     b, y0                   # y0 = (a|c)&b
302         add     y1, h                   # h = h + S1 + CH + k + w + S0
 303         or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
304         add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
305         ROTATE_ARGS
306         rotate_Xs
307 .endm
308 
 309 ## input is [rsp + _XFER + \round * 4]
310 .macro DO_ROUND round
311         mov     e, y0                   # y0 = e
312         MY_ROR  (25-11), y0             # y0 = e >> (25-11)
313         mov     a, y1                   # y1 = a
314         xor     e, y0                   # y0 = e ^ (e >> (25-11))
315         MY_ROR  (22-13), y1             # y1 = a >> (22-13)
316         mov     f, y2                   # y2 = f
 317         xor     a, y1                   # y1 = a ^ (a >> (22-13))
318         MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
319         xor     g, y2                   # y2 = f^g
320         xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
321         MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
322         and     e, y2                   # y2 = (f^g)&e
323         xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
 324         MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
325         xor     g, y2                   # y2 = CH = ((f^g)&e)^g
326         add     y0, y2                  # y2 = S1 + CH
327         MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
328         offset = \round * 4 + _XFER     #
329         add     offset(%rsp), y2        # y2 = k + w + S1 + CH
330         mov     a, y0                   # y0 = a
331         add     y2, h                   # h = h + S1 + CH + k + w
332         mov     a, y2                   # y2 = a
333         or      c, y0                   # y0 = a|c
334         add     h, d                    # d = d + h + S1 + CH + k + w
335         and     c, y2                   # y2 = a&c
336         and     b, y0                   # y0 = (a|c)&b
337         add     y1, h                   # h = h + S1 + CH + k + w + S0
 338         or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
339         add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
340         ROTATE_ARGS
341 .endm
342 
343 ########################################################################
 344 ## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
345 ## arg 1 : pointer to state
346 ## arg 2 : pointer to input data
347 ## arg 3 : Num blocks
348 ########################################################################
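    # The caller passes the state pointer in %rdi, the data pointer in %rsi
    # and the block count in %rdx (System V AMD64 ABI).  NUM_BLKS is shifted
    # left by 6 (64 bytes per SHA-256 block) to form the end-of-data pointer
    # kept at _INP_END; .Lloop0 then consumes one 64-byte block per iteration.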
349 .text
350 SYM_TYPED_FUNC_START(sha256_transform_avx)
351         pushq   %rbx
352         pushq   %r12
353         pushq   %r13
354         pushq   %r14
355         pushq   %r15
356         pushq   %rbp
357         movq    %rsp, %rbp
358 
359         subq    $STACK_SIZE, %rsp       # allocate stack space
360         and     $~15, %rsp              # align stack pointer
361 
362         shl     $6, NUM_BLKS            # convert to bytes
363         jz      .Ldone_hash
364         add     INP, NUM_BLKS           # pointer to end of data
365         mov     NUM_BLKS, _INP_END(%rsp)
366 
367         ## load initial digest
368         mov     4*0(CTX), a
369         mov     4*1(CTX), b
370         mov     4*2(CTX), c
371         mov     4*3(CTX), d
372         mov     4*4(CTX), e
373         mov     4*5(CTX), f
374         mov     4*6(CTX), g
375         mov     4*7(CTX), h
376 
377         vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
378         vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
379         vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
380 .Lloop0:
381         lea     K256(%rip), TBL
382 
383         ## byte swap first 16 dwords
384         COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
385         COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
386         COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
387         COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK
388 
389         mov     INP, _INP(%rsp)
390 
391         ## schedule 48 input dwords, by doing 3 rounds of 16 each
392         mov     $3, SRND
393 .align 16
394 .Lloop1:
395         vpaddd  (TBL), X0, XFER
396         vmovdqa XFER, _XFER(%rsp)
397         FOUR_ROUNDS_AND_SCHED
398 
399         vpaddd  1*16(TBL), X0, XFER
400         vmovdqa XFER, _XFER(%rsp)
401         FOUR_ROUNDS_AND_SCHED
402 
403         vpaddd  2*16(TBL), X0, XFER
404         vmovdqa XFER, _XFER(%rsp)
405         FOUR_ROUNDS_AND_SCHED
406 
407         vpaddd  3*16(TBL), X0, XFER
408         vmovdqa XFER, _XFER(%rsp)
409         add     $4*16, TBL
410         FOUR_ROUNDS_AND_SCHED
411 
412         sub     $1, SRND
413         jne     .Lloop1
414 
415         mov     $2, SRND
416 .Lloop2:
417         vpaddd  (TBL), X0, XFER
418         vmovdqa XFER, _XFER(%rsp)
419         DO_ROUND        0
420         DO_ROUND        1
421         DO_ROUND        2
422         DO_ROUND        3
423 
424         vpaddd  1*16(TBL), X1, XFER
425         vmovdqa XFER, _XFER(%rsp)
426         add     $2*16, TBL
427         DO_ROUND        0
428         DO_ROUND        1
429         DO_ROUND        2
430         DO_ROUND        3
431 
432         vmovdqa X2, X0
433         vmovdqa X3, X1
434 
435         sub     $1, SRND
436         jne     .Lloop2
437 
438         addm    (4*0)(CTX),a
439         addm    (4*1)(CTX),b
440         addm    (4*2)(CTX),c
441         addm    (4*3)(CTX),d
442         addm    (4*4)(CTX),e
443         addm    (4*5)(CTX),f
444         addm    (4*6)(CTX),g
445         addm    (4*7)(CTX),h
446 
447         mov     _INP(%rsp), INP
448         add     $64, INP
449         cmp     _INP_END(%rsp), INP
450         jne     .Lloop0
451 
452 .Ldone_hash:
453 
454         mov     %rbp, %rsp
455         popq    %rbp
456         popq    %r15
457         popq    %r14
458         popq    %r13
459         popq    %r12
460         popq    %rbx
461         RET
462 SYM_FUNC_END(sha256_transform_avx)
463 
464 .section        .rodata.cst256.K256, "aM", @progbits, 256
465 .align 64
466 K256:
467         .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
468         .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
469         .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
470         .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
471         .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
472         .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
473         .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
474         .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
475         .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
476         .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
477         .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
478         .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
479         .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
480         .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
481         .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
482         .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
483 
484 .section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
485 .align 16
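    # vpshufb mask that byte-swaps each dword (loads the big-endian message words)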
486 PSHUFFLE_BYTE_FLIP_MASK:
487         .octa 0x0c0d0e0f08090a0b0405060700010203
488 
489 .section        .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
490 .align 16
491 # shuffle xBxA -> 00BA
492 _SHUF_00BA:
493         .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
494 
495 .section        .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
496 .align 16
497 # shuffle xDxC -> DC00
498 _SHUF_DC00:
499         .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
