1 ############################################## 2 # Implement fast SHA-256 with AVX1 instruction 3 # 4 # Copyright (C) 2013 Intel Corporation. 5 # 6 # Authors: 7 # James Guilford <james.guilford@intel.com> 8 # Kirk Yap <kirk.s.yap@intel.com> 9 # Tim Chen <tim.c.chen@linux.intel.com> 10 # 11 # This software is available to you under a ch 12 # licenses. You may choose to be licensed und 13 # General Public License (GPL) Version 2, avai 14 # COPYING in the main directory of this source 15 # OpenIB.org BSD license below: 16 # 17 # Redistribution and use in source and bin 18 # without modification, are permitted prov 19 # conditions are met: 20 # 21 # - Redistributions of source code must r 22 # copyright notice, this list of condit 23 # disclaimer. 24 # 25 # - Redistributions in binary form must r 26 # copyright notice, this list of condit 27 # disclaimer in the documentation and/o 28 # provided with the distribution. 29 # 30 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WA 31 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITE 32 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PU 33 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHO 34 # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LI 35 # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISI 36 # CONNECTION WITH THE SOFTWARE OR THE USE OR O 37 # SOFTWARE. 38 ############################################## 39 # 40 # This code is described in an Intel White-Pap 41 # "Fast SHA-256 Implementations on Intel Archi 42 # 43 # To find it, surf to http://www.intel.com/p/e 44 # and search for that title. 
#
##############################################
# This code schedules 1 block at a time, with 4 lanes per block
# (each X register below is an xmm register holding four message dwords).
##############################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
# AT&T operand order: the add leaves mem + reg in reg (p2), the mov then
# writes that sum back to mem (p1).  Both operands end up holding the sum.
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm


# MY_ROR count, reg
# Rotate reg right by count bits.  Implemented as a double-shift left of
# reg into itself by (32 - count), so it is only meaningful for the
# 32-bit registers it is used with in this file.
.macro MY_ROR p1 p2
        shld    $(32-(\p1)), \p2, \p2
.endm

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
# The load goes through VMOVDQ (unaligned move); the swap is a vpshufb
# with the mask passed as the third argument.
.macro COPY_XMM_AND_BSWAP p1 p2 p3
        VMOVDQ  \p2, \p1
        vpshufb \p3, \p1, \p1
.endm

################################

# Message words.  These four symbols are re-bound by rotate_Xs.
X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

# Vector scratch registers
XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx         # 3rd arg
INP = %rsi              # 2nd arg
CTX = %rdi              # 1st arg

# Working variables a..h are 32-bit registers; the symbols are re-bound
# by ROTATE_ARGS.  Note the deliberate register sharing:
#   SRND uses the same register as INP, and e is the low half of the
#   register that carries NUM_BLKS.
SRND = %rsi             # clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

# Scalar scratch registers
y0 = %r13d
y1 = %r14d
y2 = %r15d


# Stack frame layout, as byte offsets from %rsp.
# With the sizes below: _INP_END = 0, _INP = 8, _XFER = 16,
# _XMM_SAVE = 32 and STACK_SIZE = 32 (the xmm save area is empty).
_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP = _INP_END + _INP_END_SIZE
_XFER = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE

# rotate_Xs
# Rotate values of symbols X0...X3
# Assembly-time only: re-binds the symbols (X0 takes the old X1, ...,
# X3 takes the old X0) and emits no instructions.
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
# Assembly-time only: b takes the old a, c the old b, ..., h the old g,
# and a takes the old h.  Emits no instructions.
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

## FOUR_ROUNDS_AND_SCHED
## Runs four SHA-256 rounds on a..h, taking k+w for each round from the
## four dwords at _XFER(%rsp), and interleaves the vector computation of
## the next four message-schedule words into X0.  Ends with rotate_Xs, so
## after expansion the freshly computed words are named X3.
## Notation in the comments: ror = rotate right, >> and << = logical
## shifts, W[-n] = the message word n positions before the first word
## being produced.
## Clobbers: y0, y1, y2, XTMP0-XTMP5, flags.
.macro FOUR_ROUNDS_AND_SCHED
        ## compute s0 four at a time and s1 two at a time
        ## compute W[-16] + W[-7] 4 at a time

        ## ---- round 0 ----
        mov     e, y0                           # y0 = e
        MY_ROR  (25-11), y0                     # y0 = e ror (25-11)
        mov     a, y1                           # y1 = a
        vpalignr $4, X2, X3, XTMP0              # XTMP0 = W[-7]
        MY_ROR  (22-13), y1                     # y1 = a ror (22-13)
        xor     e, y0                           # y0 = e ^ (e ror (25-11))
        mov     f, y2                           # y2 = f
        MY_ROR  (11-6), y0                      # y0 = (e ror (11-6)) ^ (e ror (25-6))
        xor     a, y1                           # y1 = a ^ (a ror (22-13))
        xor     g, y2                           # y2 = f^g
        vpaddd  X0, XTMP0, XTMP0                # XTMP0 = W[-7] + W[-16]
        xor     e, y0                           # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
        and     e, y2                           # y2 = (f^g)&e
        MY_ROR  (13-2), y1                      # y1 = (a ror (13-2)) ^ (a ror (22-2))
        ## compute s0
        vpalignr $4, X0, X1, XTMP1              # XTMP1 = W[-15]
        xor     a, y1                           # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
        MY_ROR  6, y0                           # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
        xor     g, y2                           # y2 = CH = ((f^g)&e)^g
        MY_ROR  2, y1                           # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
        add     y0, y2                          # y2 = S1 + CH
        add     _XFER(%rsp), y2                 # y2 = k + w + S1 + CH
        mov     a, y0                           # y0 = a
        add     y2, h                           # h = h + S1 + CH + k + w
        mov     a, y2                           # y2 = a
        vpsrld  $7, XTMP1, XTMP2                # XTMP2 = W[-15] >> 7
        or      c, y0                           # y0 = a|c
        add     h, d                            # d = d + h + S1 + CH + k + w
        and     c, y2                           # y2 = a&c
        vpslld  $(32-7), XTMP1, XTMP3           # XTMP3 = W[-15] << (32-7)
        and     b, y0                           # y0 = (a|c)&b
        add     y1, h                           # h = h + S1 + CH + k + w + S0
        vpor    XTMP2, XTMP3, XTMP3             # XTMP3 = W[-15] ror 7
        or      y2, y0                          # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                           # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS

        ## ---- round 1 ----
        mov     e, y0                           # y0 = e
        mov     a, y1                           # y1 = a
        MY_ROR  (25-11), y0                     # y0 = e ror (25-11)
        xor     e, y0                           # y0 = e ^ (e ror (25-11))
        mov     f, y2                           # y2 = f
        MY_ROR  (22-13), y1                     # y1 = a ror (22-13)
        vpsrld  $18, XTMP1, XTMP2               # XTMP2 = W[-15] >> 18
        xor     a, y1                           # y1 = a ^ (a ror (22-13))
        MY_ROR  (11-6), y0                      # y0 = (e ror (11-6)) ^ (e ror (25-6))
        xor     g, y2                           # y2 = f^g
        vpsrld  $3, XTMP1, XTMP4                # XTMP4 = W[-15] >> 3
        MY_ROR  (13-2), y1                      # y1 = (a ror (13-2)) ^ (a ror (22-2))
        xor     e, y0                           # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
        and     e, y2                           # y2 = (f^g)&e
        MY_ROR  6, y0                           # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
        vpslld  $(32-18), XTMP1, XTMP1          # XTMP1 = W[-15] << (32-18)
        xor     a, y1                           # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
        xor     g, y2                           # y2 = CH = ((f^g)&e)^g
        vpxor   XTMP1, XTMP3, XTMP3             # XTMP3 = (W[-15] ror 7) ^ (W[-15] << (32-18))
        add     y0, y2                          # y2 = S1 + CH
        add     (1*4 + _XFER)(%rsp), y2         # y2 = k + w + S1 + CH
        MY_ROR  2, y1                           # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
        vpxor   XTMP2, XTMP3, XTMP3             # XTMP3 = (W[-15] ror 7) ^ (W[-15] ror 18)
        mov     a, y0                           # y0 = a
        add     y2, h                           # h = h + S1 + CH + k + w
        mov     a, y2                           # y2 = a
        vpxor   XTMP4, XTMP3, XTMP1             # XTMP1 = s0
        or      c, y0                           # y0 = a|c
        add     h, d                            # d = d + h + S1 + CH + k + w
        and     c, y2                           # y2 = a&c
        ## compute low s1
        vpshufd $0b11111010, X3, XTMP2          # XTMP2 = W[-2] {BBAA}
        and     b, y0                           # y0 = (a|c)&b
        add     y1, h                           # h = h + S1 + CH + k + w + S0
        vpaddd  XTMP1, XTMP0, XTMP0             # XTMP0 = W[-16] + W[-7] + s0
        or      y2, y0                          # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                           # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS

        ## ---- round 2 ----
        mov     e, y0                           # y0 = e
        mov     a, y1                           # y1 = a
        MY_ROR  (25-11), y0                     # y0 = e ror (25-11)
        xor     e, y0                           # y0 = e ^ (e ror (25-11))
        MY_ROR  (22-13), y1                     # y1 = a ror (22-13)
        mov     f, y2                           # y2 = f
        xor     a, y1                           # y1 = a ^ (a ror (22-13))
        MY_ROR  (11-6), y0                      # y0 = (e ror (11-6)) ^ (e ror (25-6))
        vpsrld  $10, XTMP2, XTMP4               # XTMP4 = W[-2] >> 10 {BBAA}
        xor     g, y2                           # y2 = f^g
        vpsrlq  $19, XTMP2, XTMP3               # XTMP3 = W[-2] ror 19 {xBxA}
        xor     e, y0                           # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
        and     e, y2                           # y2 = (f^g)&e
        vpsrlq  $17, XTMP2, XTMP2               # XTMP2 = W[-2] ror 17 {xBxA}
        MY_ROR  (13-2), y1                      # y1 = (a ror (13-2)) ^ (a ror (22-2))
        xor     a, y1                           # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
        xor     g, y2                           # y2 = CH = ((f^g)&e)^g
        MY_ROR  6, y0                           # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
        vpxor   XTMP3, XTMP2, XTMP2             # XTMP2 = (W[-2] ror 17) ^ (W[-2] ror 19) {xBxA}
        add     y0, y2                          # y2 = S1 + CH
        MY_ROR  2, y1                           # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
        add     (2*4 + _XFER)(%rsp), y2         # y2 = k + w + S1 + CH
        vpxor   XTMP2, XTMP4, XTMP4             # XTMP4 = s1 {xBxA}
        mov     a, y0                           # y0 = a
        add     y2, h                           # h = h + S1 + CH + k + w
        mov     a, y2                           # y2 = a
        vpshufb SHUF_00BA, XTMP4, XTMP4         # XTMP4 = s1 {00BA}
        or      c, y0                           # y0 = a|c
        add     h, d                            # d = d + h + S1 + CH + k + w
        and     c, y2                           # y2 = a&c
        vpaddd  XTMP4, XTMP0, XTMP0             # XTMP0 = {..., ..., W[1], W[0]}
        and     b, y0                           # y0 = (a|c)&b
        add     y1, h                           # h = h + S1 + CH + k + w + S0
        ## compute high s1
        vpshufd $0b01010000, XTMP0, XTMP2       # XTMP2 = W[-2] {DDCC}
        or      y2, y0                          # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                           # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS

        ## ---- round 3 ----
        mov     e, y0                           # y0 = e
        MY_ROR  (25-11), y0                     # y0 = e ror (25-11)
        mov     a, y1                           # y1 = a
        MY_ROR  (22-13), y1                     # y1 = a ror (22-13)
        xor     e, y0                           # y0 = e ^ (e ror (25-11))
        mov     f, y2                           # y2 = f
        MY_ROR  (11-6), y0                      # y0 = (e ror (11-6)) ^ (e ror (25-6))
        vpsrld  $10, XTMP2, XTMP5               # XTMP5 = W[-2] >> 10 {DDCC}
        xor     a, y1                           # y1 = a ^ (a ror (22-13))
        xor     g, y2                           # y2 = f^g
        vpsrlq  $19, XTMP2, XTMP3               # XTMP3 = W[-2] ror 19 {xDxC}
        xor     e, y0                           # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
        and     e, y2                           # y2 = (f^g)&e
        MY_ROR  (13-2), y1                      # y1 = (a ror (13-2)) ^ (a ror (22-2))
        vpsrlq  $17, XTMP2, XTMP2               # XTMP2 = W[-2] ror 17 {xDxC}
        xor     a, y1                           # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
        MY_ROR  6, y0                           # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
        xor     g, y2                           # y2 = CH = ((f^g)&e)^g
        vpxor   XTMP3, XTMP2, XTMP2             # XTMP2 = (W[-2] ror 17) ^ (W[-2] ror 19) {xDxC}
        MY_ROR  2, y1                           # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
        add     y0, y2                          # y2 = S1 + CH
        add     (3*4 + _XFER)(%rsp), y2         # y2 = k + w + S1 + CH
        vpxor   XTMP2, XTMP5, XTMP5             # XTMP5 = s1 {xDxC}
        mov     a, y0                           # y0 = a
        add     y2, h                           # h = h + S1 + CH + k + w
        mov     a, y2                           # y2 = a
        vpshufb SHUF_DC00, XTMP5, XTMP5         # XTMP5 = s1 {DC00}
        or      c, y0                           # y0 = a|c
        add     h, d                            # d = d + h + S1 + CH + k + w
        and     c, y2                           # y2 = a&c
        vpaddd  XTMP0, XTMP5, X0                # X0 = {W[3], W[2], W[1], W[0]}
        and     b, y0                           # y0 = (a|c)&b
        add     y1, h                           # h = h + S1 + CH + k + w + S0
        or      y2, y0                          # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                           # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
        rotate_Xs
.endm

## DO_ROUND round
## One SHA-256 round without message scheduling.
## input is [rsp + _XFER + round * 4]
## Clobbers: y0, y1, y2, flags.  Ends with ROTATE_ARGS.
.macro DO_ROUND round
        mov     e, y0                           # y0 = e
        MY_ROR  (25-11), y0                     # y0 = e ror (25-11)
        mov     a, y1                           # y1 = a
        xor     e, y0                           # y0 = e ^ (e ror (25-11))
        MY_ROR  (22-13), y1                     # y1 = a ror (22-13)
        mov     f, y2                           # y2 = f
        xor     a, y1                           # y1 = a ^ (a ror (22-13))
        MY_ROR  (11-6), y0                      # y0 = (e ror (11-6)) ^ (e ror (25-6))
        xor     g, y2                           # y2 = f^g
        xor     e, y0                           # y0 = e ^ (e ror (11-6)) ^ (e ror (25-6))
        MY_ROR  (13-2), y1                      # y1 = (a ror (13-2)) ^ (a ror (22-2))
        and     e, y2                           # y2 = (f^g)&e
        xor     a, y1                           # y1 = a ^ (a ror (13-2)) ^ (a ror (22-2))
        MY_ROR  6, y0                           # y0 = S1 = (e ror 6) ^ (e ror 11) ^ (e ror 25)
        xor     g, y2                           # y2 = CH = ((f^g)&e)^g
        add     y0, y2                          # y2 = S1 + CH
        MY_ROR  2, y1                           # y1 = S0 = (a ror 2) ^ (a ror 13) ^ (a ror 22)
        offset = \round * 4 + _XFER             # stack offset of k + w for this round
        add     offset(%rsp), y2                # y2 = k + w + S1 + CH
        mov     a, y0                           # y0 = a
        add     y2, h                           # h = h + S1 + CH + k + w
        mov     a, y2                           # y2 = a
        or      c, y0                           # y0 = a|c
        add     h, d                            # d = d + h + S1 + CH + k + w
        and     c, y2                           # y2 = a&c
        and     b, y0                           # y0 = (a|c)&b
        add     y1, h                           # h = h + S1 + CH + k + w + S0
        or      y2, y0                          # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                           # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
.endm

##############################################
## void sha256_transform_avx(state sha256_state *state, ...)
## (the rest of the prototype is cut off in this copy of the file;
##  confirm the remaining parameter types against the C declaration)
## arg 1 : pointer to state  (CTX, eight 32-bit words at offsets 0..28)
## arg 2 : pointer to input data (INP, consumed 64 bytes per block)
## arg 3 : Num blocks (NUM_BLKS; zero returns without touching state)
##
## Saves and restores rbx, r12-r15 and rbp.  Uses STACK_SIZE bytes of
## 16-byte aligned stack below the saved registers; rbp holds the
## pre-alignment stack pointer for the epilogue.
##############################################
.text
SYM_TYPED_FUNC_START(sha256_transform_avx)
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        pushq   %rbp
        movq    %rsp, %rbp

        subq    $STACK_SIZE, %rsp               # allocate stack space
        and     $~15, %rsp                      # align stack pointer (vmovdqa stores to _XFER)

        shl     $6, NUM_BLKS                    # convert block count to bytes (64 per block)
        jz      .Ldone_hash                     # no blocks: nothing to do
        add     INP, NUM_BLKS                   # pointer to end of data
        mov     NUM_BLKS, _INP_END(%rsp)        # spill it: the register is reused as e below

        ## load initial digest
        mov     4*0(CTX), a
        mov     4*1(CTX), b
        mov     4*2(CTX), c
        mov     4*3(CTX), d
        mov     4*4(CTX), e
        mov     4*5(CTX), f
        mov     4*6(CTX), g
        mov     4*7(CTX), h

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa _SHUF_DC00(%rip), SHUF_DC00
.Lloop0:
        lea     K256(%rip), TBL

        ## byte swap first 16 dwords
        COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK

        mov     INP, _INP(%rsp)                 # spill INP: SRND reuses its register

        ## schedule 48 input dwords, by doing 3 rounds of 16 each
        mov     $3, SRND
.align 16
.Lloop1:
        vpaddd  (TBL), X0, XFER                 # XFER = k + w for the next four rounds
        vmovdqa XFER, _XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        vpaddd  1*16(TBL), X0, XFER
        vmovdqa XFER, _XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        vpaddd  2*16(TBL), X0, XFER
        vmovdqa XFER, _XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        vpaddd  3*16(TBL), X0, XFER
        vmovdqa XFER, _XFER(%rsp)
        add     $4*16, TBL                      # advance to the next 16 round constants
        FOUR_ROUNDS_AND_SCHED

        sub     $1, SRND
        jne     .Lloop1

        ## final 16 rounds: 2 passes of 8, no further scheduling
        mov     $2, SRND
.Lloop2:
        vpaddd  (TBL), X0, XFER
        vmovdqa XFER, _XFER(%rsp)
        DO_ROUND        0
        DO_ROUND        1
        DO_ROUND        2
        DO_ROUND        3

        vpaddd  1*16(TBL), X1, XFER
        vmovdqa XFER, _XFER(%rsp)
        add     $2*16, TBL
        DO_ROUND        0
        DO_ROUND        1
        DO_ROUND        2
        DO_ROUND        3

        vmovdqa X2, X0                          # second pass consumes the remaining words
        vmovdqa X3, X1

        sub     $1, SRND
        jne     .Lloop2

        ## add the working variables into the state and keep the sums in
        ## registers for the next block
        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        mov     _INP(%rsp), INP
        add     $64, INP                        # next 64-byte block
        cmp     _INP_END(%rsp), INP
        jne     .Lloop0

.Ldone_hash:

        mov     %rbp, %rsp
        popq    %rbp
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        RET
SYM_FUNC_END(sha256_transform_avx)

## SHA-256 round constants K[0..63], four per row.
.section        .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

## vpshufb mask that reverses the bytes within each dword
.section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203

.section        .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
        .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section        .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
        .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.