########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm
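
# Rough C equivalent of addm (illustrative sketch only, not assembled;
# p1 is the memory operand, p2 the register):
#
#     static inline void addm(u32 *p1, u32 *p2)
#     {
#             *p2 += *p1;     /* add  p1, p2 */
#             *p1  = *p2;     /* mov  p2, p1 */
#     }
#
# It is used at the end of each block to fold the working variables back
# into the digest words held in CTX.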

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =     %ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 =     %ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx # 3rd arg
INP     = %rsi  # 2nd arg
CTX     = %rdi  # 1st arg
c       = %ecx
d       = %r8d
e       = %edx  # clobbers NUM_BLKS
y3      = %esi  # clobbers INP

SRND    = CTX   # SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE      = 2*64*4        # 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE  = 0
_INP_END_SIZE   = 8
_INP_SIZE       = 8
_CTX_SIZE       = 8

_XFER           = 0
_XMM_SAVE       = _XFER     + _XFER_SIZE
_INP_END        = _XMM_SAVE + _XMM_SAVE_SIZE
_INP            = _INP_END  + _INP_END_SIZE
_CTX            = _INP      + _INP_SIZE
STACK_SIZE      = _CTX      + _CTX_SIZE

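# Viewed as a C struct, the frame laid out by the offsets above would look
# roughly like this (sketch only; the code addresses it through the _XFER,
# _INP_END, _INP and _CTX offsets, with %rsp 32-byte aligned):
#
#     struct frame {
#             u32 xfer[2 * 64];       /* _XFER: K+W words for 2 blocks    */
#             u64 inp_end;            /* _INP_END: pointer to last block  */
#             u64 inp;                /* _INP: current input pointer      */
#             u64 ctx;                /* _CTX: saved state pointer        */
#     };
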
# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
        X_ = X0
        X0 = X1
        X1 = X2
        X2 = X3
        X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
        old_h = h
        TMP_ = h
        h = g
        g = f
        f = e
        e = d
        d = c
        c = b
        b = a
        a = TMP_
.endm
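
# Neither macro moves data at run time: they only rebind assembler symbols,
# so the round body is written once in terms of a..h (and X0..X3) and the
# rotation happens at macro-expansion time.  For example, one ROTATE_ARGS
# renames the registers as follows:
#
#     before: a=%eax  b=%ebx  c=%ecx  d=%r8d  e=%edx  f=%r9d  g=%r10d h=%r11d
#     after:  a=%r11d b=%eax  c=%ebx  d=%ecx  e=%r8d  f=%edx  g=%r9d  h=%r10d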

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B

        addl    \disp(%rsp, SRND), h            # h = k + w + h         # --
        or      c, y3           # y3 = a|c                              # MAJA
        vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH
        vpaddd  X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1

        and     e, y2           # y2 = (f^g)&e                          # CH
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        add     h, d            # d = k + w + h + d                     # --

        and     b, y3           # y3 = (a|c)&b                          # MAJA
        vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0

        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        vpsrld  $7, XTMP1, XTMP2
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB

        add     y0, y2          # y2 = S1 + CH                          # --
        vpslld  $(32-7), XTMP1, XTMP3
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7

        vpsrld  $18, XTMP1, XTMP2
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --


        ROTATE_ARGS

################################### RND N + 1 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        offset = \disp + 1*4
        addl    offset(%rsp, SRND), h   # h = k + w + h         # --
        or      c, y3           # y3 = a|c                              # MAJA


        vpsrld  $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH


        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     h, d            # d = k + w + h + d                     # --

        vpslld  $(32-18), XTMP1, XTMP1
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0

        vpxor   XTMP1, XTMP3, XTMP3
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --

        vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
        vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        vpsrld  $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


        ROTATE_ARGS

################################### RND N + 2 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        offset = \disp + 2*4
        addl    offset(%rsp, SRND), h   # h = k + w + h         # --

        vpsrlq  $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        or      c, y3           # y3 = a|c                              # MAJA
        mov     f, y2           # y2 = f                                # CH
        xor     g, y2           # y2 = f^g                              # CH

        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xBxA}
        and     e, y2           # y2 = (f^g)&e                          # CH

        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        vpxor   XTMP3, XTMP2, XTMP2
        add     h, d            # d = k + w + h + d                     # --
        and     b, y3           # y3 = (a|c)&b                          # MAJA

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --
        vpshufd $0b01010000, XTMP0, XTMP2       # XTMP2 = W[-2] {DDCC}

        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --

        add     y3, h           # h = t1 + S0 + MAJ                     # --


        ROTATE_ARGS

################################### RND N + 3 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        offset = \disp + 3*4
        addl    offset(%rsp, SRND), h   # h = k + w + h         # --
        or      c, y3           # y3 = a|c                              # MAJA


        vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH


        vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] ror 19 {xDxC}
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     h, d            # d = k + w + h + d                     # --
        and     b, y3           # y3 = (a|c)&b                          # MAJA

        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xDxC}
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpxor   XTMP3, XTMP2, XTMP2
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        add     y0, y2          # y2 = S1 + CH                          # --

        vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}

        vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS
        rotate_Xs
.endm
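
# The macro above interleaves four compression rounds with the SHA-256
# message schedule.  As a scalar reference (standard FIPS 180-4 definitions,
# for comparison only; not part of this file's build):
#
#     #define ROR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#     static inline u32 s0(u32 x) { return ROR32(x, 7) ^ ROR32(x, 18) ^ (x >> 3); }
#     static inline u32 s1(u32 x) { return ROR32(x, 17) ^ ROR32(x, 19) ^ (x >> 10); }
#     /* W[i] = W[i-16] + s0(W[i-15]) + W[i-7] + s1(W[i-2]) */
#
# XTMP0 accumulates W[-16] + W[-7] + s0(W[-15]); s1() of the W[-2] values is
# formed in the {00BA} and {DC00} halves and added in, leaving X0 =
# {W[3], W[2], W[1], W[0]} for the next group of rounds.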

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        addl    \disp(%rsp, SRND), h            # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 1 ###########################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*1 + \disp
        addl    offset(%rsp, SRND), h           # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 2 ##############################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*2 + \disp
        addl    offset(%rsp, SRND), h           # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 3 ###########################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*3 + \disp
        addl    offset(%rsp, SRND), h           # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --


        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --

        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS

.endm
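
# Scalar reference for one round as computed by the two macros above
# (standard SHA-256 round; ROR32 as in the sketch after
# FOUR_ROUNDS_AND_SCHED; comparison only, not assembled):
#
#     static inline u32 Ch(u32 e, u32 f, u32 g)  { return ((f ^ g) & e) ^ g; }
#     static inline u32 Maj(u32 a, u32 b, u32 c) { return ((a | c) & b) | (a & c); }
#     static inline u32 S0(u32 a) { return ROR32(a, 2) ^ ROR32(a, 13) ^ ROR32(a, 22); }
#     static inline u32 S1(u32 e) { return ROR32(e, 6) ^ ROR32(e, 11) ^ ROR32(e, 25); }
#
#     t1 = h + S1(e) + Ch(e, f, g) + k + w;   /* k+w is read from the _XFER slot */
#     d += t1;
#     h  = t1 + S0(a) + Maj(a, b, c);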

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
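## Expected call pattern from the C glue code (sketch only, assuming the
## usual x86 SIMD convention; see arch/x86/crypto/sha256_ssse3_glue.c for
## the real caller):
##
##     kernel_fpu_begin();
##     sha256_transform_rorx(state, data, blocks);
##     kernel_fpu_end();
##
## The YMM registers used here are only valid inside a
## kernel_fpu_begin()/kernel_fpu_end() section.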
.text
SYM_TYPED_FUNC_START(sha256_transform_rorx)
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

        push    %rbp
        mov     %rsp, %rbp

        subq    $STACK_SIZE, %rsp
        and     $-32, %rsp      # align rsp to 32 byte boundary

        shl     $6, NUM_BLKS    # convert to bytes
        jz      .Ldone_hash
        lea     -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
        mov     NUM_BLKS, _INP_END(%rsp)

        cmp     NUM_BLKS, INP
        je      .Lonly_one_block

        ## load initial digest
        mov     (CTX), a
        mov     4*1(CTX), b
        mov     4*2(CTX), c
        mov     4*3(CTX), d
        mov     4*4(CTX), e
        mov     4*5(CTX), f
        mov     4*6(CTX), g
        mov     4*7(CTX), h

        vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)

.Lloop0:
        ## Load first 16 dwords from two blocks
        VMOVDQ  0*32(INP),XTMP0
        VMOVDQ  1*32(INP),XTMP1
        VMOVDQ  2*32(INP),XTMP2
        VMOVDQ  3*32(INP),XTMP3

        ## byte swap data
        vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
        vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
        vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
        vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3

        ## transpose data into high/low halves
        vperm2i128      $0x20, XTMP2, XTMP0, X0
        vperm2i128      $0x31, XTMP2, XTMP0, X1
        vperm2i128      $0x20, XTMP3, XTMP1, X2
        vperm2i128      $0x31, XTMP3, XTMP1, X3
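
        ## After the transpose, each X register holds the same four message
        ## words from both blocks: block 0 in the low 128-bit lane, block 1
        ## in the high lane (the "2 blocks at a time, 4 lanes per block"
        ## layout from the file header), e.g.:
        ##
        ##     X0 = { blk1 W[3:0]   | blk0 W[3:0]   }
        ##     X1 = { blk1 W[7:4]   | blk0 W[7:4]   }
        ##     X2 = { blk1 W[11:8]  | blk0 W[11:8]  }
        ##     X3 = { blk1 W[15:12] | blk0 W[15:12] }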

.Llast_block_enter:
        add     $64, INP
        mov     INP, _INP(%rsp)

        ## schedule 48 input dwords, by doing 3 rounds of 16 each
        xor     SRND, SRND

.align 16
.Lloop1:
        leaq    K256+0*32(%rip), INP            ## reuse INP as scratch reg
        vpaddd  (INP, SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   (_XFER + 0*32)

        leaq    K256+1*32(%rip), INP
        vpaddd  (INP, SRND), X0, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   (_XFER + 1*32)

        leaq    K256+2*32(%rip), INP
        vpaddd  (INP, SRND), X0, XFER
        vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   (_XFER + 2*32)

        leaq    K256+3*32(%rip), INP
        vpaddd  (INP, SRND), X0, XFER
        vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   (_XFER + 3*32)

        add     $4*32, SRND
        cmp     $3*4*32, SRND
        jb      .Lloop1

.Lloop2:
        ## Do last 16 rounds with no scheduling
        leaq    K256+0*32(%rip), INP
        vpaddd  (INP, SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      (_XFER + 0*32)

        leaq    K256+1*32(%rip), INP
        vpaddd  (INP, SRND), X1, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      (_XFER + 1*32)
        add     $2*32, SRND

        vmovdqa X2, X0
        vmovdqa X3, X1

        cmp     $4*4*32, SRND
        jb      .Lloop2

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP

        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        cmp     _INP_END(%rsp), INP
        ja      .Ldone_hash

        #### Do second block using previously scheduled results
        xor     SRND, SRND
.align 16
.Lloop3:
        DO_4ROUNDS      (_XFER + 0*32 + 16)
        DO_4ROUNDS      (_XFER + 1*32 + 16)
        add     $2*32, SRND
        cmp     $4*4*32, SRND
        jb      .Lloop3

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP
        add     $64, INP

        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        cmp     _INP_END(%rsp), INP
        jb      .Lloop0
        ja      .Ldone_hash

.Ldo_last_block:
        VMOVDQ  0*16(INP),XWORD0
        VMOVDQ  1*16(INP),XWORD1
        VMOVDQ  2*16(INP),XWORD2
        VMOVDQ  3*16(INP),XWORD3

        vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
        vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
        vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
        vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

        jmp     .Llast_block_enter

.Lonly_one_block:

        ## load initial digest
        mov     (4*0)(CTX),a
        mov     (4*1)(CTX),b
        mov     (4*2)(CTX),c
        mov     (4*3)(CTX),d
        mov     (4*4)(CTX),e
        mov     (4*5)(CTX),f
        mov     (4*6)(CTX),g
        mov     (4*7)(CTX),h

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)
        jmp     .Ldo_last_block

.Ldone_hash:

        mov     %rbp, %rsp
        pop     %rbp

        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        vzeroupper
        RET
SYM_FUNC_END(sha256_transform_rorx)

.section        .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section        .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
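
# Each mask byte selects a source byte for vpshufb, so the pattern above
# reverses the four bytes of every dword, converting the big-endian SHA-256
# message words to little-endian lane order.  For example, the input bytes
# 00 01 02 03 (in memory order) become the dword value 0x00010203.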

# shuffle xBxA -> 00BA
.section        .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
        .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section        .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
        .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
