#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert the AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. The trigger for the modification was
# the observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress the
# AVX512F capability flag [at least on Skylake-X], the conversion serves
# as a kind of "investment protection". Note that the next *lake processor,
# Cannonlake, has an AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
# P4		4.46/+120%	-
# Core 2	2.41/+90%	-
# Westmere	1.88/+120%	-
# Sandy Bridge	1.39/+140%	1.10
# Haswell	1.14/+175%	1.11		0.65
# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
# Silvermont	2.83/+95%	-
# Knights L	3.60/?		1.65		1.10	0.41(***)
# Goldmont	1.70/+180%	-
# VIA Nano	1.82/+150%	-
# Sledgehammer	1.38/+160%	-
# Bulldozer	2.30/+130%	0.97
# Ryzen		1.15/+200%	1.08		1.18
#
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors; in both cases we are comparing to
#	__int128 code;
# (**)	an SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than integer-only code only on the older Intel P4 and
#	Core processors (by 30-50%, less so the newer the processor), and it
#	was slower on contemporary ones, for example almost 2x slower on
#	Atom; as the former are naturally disappearing, SSE2 is deemed
#	unnecessary;
# (***)	strangely enough, performance seems to vary from core to core;
#	the listed result is the best case;

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$kernel=0; $kernel=1 if (!$flavour && !$output);

if (!$kernel) {
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
	*STDOUT=*OUT;

	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}
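
# Editorial note (not from the original source): $avx selects which SIMD
# code paths get emitted below; roughly 1 => AVX, 2 => AVX2, 3 => AVX-512,
# and 4 additionally enables the base 2^44 initialization path. For the
# kernel build it is simply pinned to 4 above, and the actual capability
# selection happens at build/run time rather than here.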

sub declare_function() {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl	$name\n";
		$code .= ".type	$name,\@function,$nargs\n";
		$code .= ".align	$align\n";
		$code .= "$name:\n";
	}
}

sub end_function() {
	my ($name) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size	$name,.-$name\n";
	}
}

$code.=<<___ if $kernel;
#include <linux/linkage.h>
___

if ($avx) {
$code.=<<___ if $kernel;
.section .rodata
___
$code.=<<___;
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
$code.=<<___ if (!$kernel);
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
my ($h0,$h1,$h2)=("%r14","%rbx","%r10");

sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
$code.=<<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2
___
}
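
# Editorial note: the "last reduction step" above relies on the identity
# 2^130 == 5 (mod 2^130-5): the accumulator bits at or above 2^130, i.e.
# $d3>>2, are folded back in multiplied by 5, since ($d3 & -4) + ($d3>>2)
# equals 5*($d3>>2). A minimal standalone Perl sketch of that identity
# (illustrative only, not used by the generator; the test value is
# arbitrary):
#
#	use Math::BigInt;
#	my $p    = Math::BigInt->new(2)->bpow(130)->bsub(5);	# 2^130 - 5
#	my $h    = Math::BigInt->new(2)->bpow(131)->badd(7);	# >130-bit value
#	my $hi   = $h->copy->brsft(130);			# bits >= 2^130
#	my $lo   = $h->copy->band($p->copy->badd(4));		# low 130 bits
#	my $fold = $lo->badd($hi->bmul(5));			# partially reduced
#	die "fold mismatch" unless $fold->bmod($p) == $h->copy->bmod($p);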
########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64

$code.=<<___;
.text
___
$code.=<<___ if (!$kernel);
.extern	OPENSSL_ia32cap_P

.globl	poly1305_init_x86_64
.hidden	poly1305_init_x86_64
.globl	poly1305_blocks_x86_64
.hidden	poly1305_blocks_x86_64
.globl	poly1305_emit_x86_64
.hidden	poly1305_emit_x86_64
___
&declare_function("poly1305_init_x86_64", 32, 3);
$code.=<<___;
	xor	%eax,%eax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

	test	$inp,$inp
	je	.Lno_key
___
$code.=<<___ if (!$kernel);
	lea	poly1305_blocks_x86_64(%rip),%r10
	lea	poly1305_emit_x86_64(%rip),%r11
___
$code.=<<___ if (!$kernel && $avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
	cmovc	%rax,%r10
	cmovc	%rcx,%r11
___
$code.=<<___ if (!$kernel && $avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	cmovc	%rax,%r10
___
$code.=<<___ if (!$kernel && $avx>3);
	mov	\$`(1<<31|1<<21|1<<16)`,%rax
	shr	\$32,%r9
	and	%rax,%r9
	cmp	%rax,%r9
	je	.Linit_base2_44
___
$code.=<<___;
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	and	8($inp),%rcx
	mov	%rax,24($ctx)
	mov	%rcx,32($ctx)
___
$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
$code.=<<___;
	mov	\$1,%eax
.Lno_key:
	RET
___
&end_function("poly1305_init_x86_64");

&declare_function("poly1305_blocks_x86_64", 32, 4);
$code.=<<___;
.cfi_startproc
.Lblocks:
	shr	\$4,$len
	jz	.Lno_data		# too short

	push	%rbx
.cfi_push	%rbx
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	push	$ctx
.cfi_push	$ctx
.Lblocks_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2

	mov	$s1,$r1
	shr	\$2,$s1
	mov	$r1,%rax
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	jmp	.Loop

.align	32
.Loop:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
___

	&poly1305_iteration();

$code.=<<___;
	mov	$r1,%rax
	dec	%r15			# len-=16
	jnz	.Loop

	mov	0(%rsp),$ctx
.cfi_restore	$ctx

	mov	$h0,0($ctx)		# store hash value
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)

	mov	8(%rsp),%r15
.cfi_restore	%r15
	mov	16(%rsp),%r14
.cfi_restore	%r14
	mov	24(%rsp),%r13
.cfi_restore	%r13
	mov	32(%rsp),%r12
.cfi_restore	%r12
	mov	40(%rsp),%rbx
.cfi_restore	%rbx
	lea	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	RET
.cfi_endproc
___
&end_function("poly1305_blocks_x86_64");

&declare_function("poly1305_emit_x86_64", 32, 3);
$code.=<<___;
.Lemit:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value
overflow? 393 cmovnz %r8,%rax 394 cmovnz %r9,%rcx 395 396 add 0($nonce),%rax # accumulate nonce 397 adc 8($nonce),%rcx 398 mov %rax,0($mac) # write result 399 mov %rcx,8($mac) 400 401 RET 402 ___ 403 &end_function("poly1305_emit_x86_64"); 404 if ($avx) { 405 406 ######################################################################## 407 # Layout of opaque area is following. 408 # 409 # unsigned __int32 h[5]; # current hash value base 2^26 410 # unsigned __int32 is_base2_26; 411 # unsigned __int64 r[2]; # key value base 2^64 412 # unsigned __int64 pad; 413 # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; 414 # 415 # where r^n are base 2^26 digits of degrees of multiplier key. There are 416 # 5 digits, but last four are interleaved with multiples of 5, totalling 417 # in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. 418 419 my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = 420 map("%xmm$_",(0..15)); 421 422 $code.=<<___; 423 .type __poly1305_block,\@abi-omnipotent 424 .align 32 425 __poly1305_block: 426 push $ctx 427 ___ 428 &poly1305_iteration(); 429 $code.=<<___; 430 pop $ctx 431 RET 432 .size __poly1305_block,.-__poly1305_block 433 434 .type __poly1305_init_avx,\@abi-omnipotent 435 .align 32 436 __poly1305_init_avx: 437 push %rbp 438 mov %rsp,%rbp 439 mov $r0,$h0 440 mov $r1,$h1 441 xor $h2,$h2 442 443 lea 48+64($ctx),$ctx # size optimization 444 445 mov $r1,%rax 446 call __poly1305_block # r^2 447 448 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 449 mov \$0x3ffffff,%edx 450 mov $h0,$d1 451 and $h0#d,%eax 452 mov $r0,$d2 453 and $r0#d,%edx 454 mov %eax,`16*0+0-64`($ctx) 455 shr \$26,$d1 456 mov %edx,`16*0+4-64`($ctx) 457 shr \$26,$d2 458 459 mov \$0x3ffffff,%eax 460 mov \$0x3ffffff,%edx 461 and $d1#d,%eax 462 and $d2#d,%edx 463 mov %eax,`16*1+0-64`($ctx) 464 lea (%rax,%rax,4),%eax # *5 465 mov %edx,`16*1+4-64`($ctx) 466 lea (%rdx,%rdx,4),%edx # *5 467 mov %eax,`16*2+0-64`($ctx) 468 shr \$26,$d1 469 mov %edx,`16*2+4-64`($ctx) 470 shr \$26,$d2 471 472 mov $h1,%rax 473 mov $r1,%rdx 474 shl \$12,%rax 475 shl \$12,%rdx 476 or $d1,%rax 477 or $d2,%rdx 478 and \$0x3ffffff,%eax 479 and \$0x3ffffff,%edx 480 mov %eax,`16*3+0-64`($ctx) 481 lea (%rax,%rax,4),%eax # *5 482 mov %edx,`16*3+4-64`($ctx) 483 lea (%rdx,%rdx,4),%edx # *5 484 mov %eax,`16*4+0-64`($ctx) 485 mov $h1,$d1 486 mov %edx,`16*4+4-64`($ctx) 487 mov $r1,$d2 488 489 mov \$0x3ffffff,%eax 490 mov \$0x3ffffff,%edx 491 shr \$14,$d1 492 shr \$14,$d2 493 and $d1#d,%eax 494 and $d2#d,%edx 495 mov %eax,`16*5+0-64`($ctx) 496 lea (%rax,%rax,4),%eax # *5 497 mov %edx,`16*5+4-64`($ctx) 498 lea (%rdx,%rdx,4),%edx # *5 499 mov %eax,`16*6+0-64`($ctx) 500 shr \$26,$d1 501 mov %edx,`16*6+4-64`($ctx) 502 shr \$26,$d2 503 504 mov $h2,%rax 505 shl \$24,%rax 506 or %rax,$d1 507 mov $d1#d,`16*7+0-64`($ctx) 508 lea ($d1,$d1,4),$d1 # *5 509 mov $d2#d,`16*7+4-64`($ctx) 510 lea ($d2,$d2,4),$d2 # *5 511 mov $d1#d,`16*8+0-64`($ctx) 512 mov $d2#d,`16*8+4-64`($ctx) 513 514 mov $r1,%rax 515 call __poly1305_block # r^3 516 517 mov \$0x3ffffff,%eax # save r^3 base 2^26 518 mov $h0,$d1 519 and $h0#d,%eax 520 shr \$26,$d1 521 mov %eax,`16*0+12-64`($ctx) 522 523 mov \$0x3ffffff,%edx 524 and $d1#d,%edx 525 mov %edx,`16*1+12-64`($ctx) 526 lea (%rdx,%rdx,4),%edx # *5 527 shr \$26,$d1 528 mov %edx,`16*2+12-64`($ctx) 529 530 mov $h1,%rax 531 shl \$12,%rax 532 or $d1,%rax 533 and \$0x3ffffff,%eax 534 mov %eax,`16*3+12-64`($ctx) 535 lea (%rax,%rax,4),%eax # *5 536 mov $h1,$d1 537 mov %eax,`16*4+12-64`($ctx) 538 539 mov 
\$0x3ffffff,%edx 540 shr \$14,$d1 541 and $d1#d,%edx 542 mov %edx,`16*5+12-64`($ctx) 543 lea (%rdx,%rdx,4),%edx # *5 544 shr \$26,$d1 545 mov %edx,`16*6+12-64`($ctx) 546 547 mov $h2,%rax 548 shl \$24,%rax 549 or %rax,$d1 550 mov $d1#d,`16*7+12-64`($ctx) 551 lea ($d1,$d1,4),$d1 # *5 552 mov $d1#d,`16*8+12-64`($ctx) 553 554 mov $r1,%rax 555 call __poly1305_block # r^4 556 557 mov \$0x3ffffff,%eax # save r^4 base 2^26 558 mov $h0,$d1 559 and $h0#d,%eax 560 shr \$26,$d1 561 mov %eax,`16*0+8-64`($ctx) 562 563 mov \$0x3ffffff,%edx 564 and $d1#d,%edx 565 mov %edx,`16*1+8-64`($ctx) 566 lea (%rdx,%rdx,4),%edx # *5 567 shr \$26,$d1 568 mov %edx,`16*2+8-64`($ctx) 569 570 mov $h1,%rax 571 shl \$12,%rax 572 or $d1,%rax 573 and \$0x3ffffff,%eax 574 mov %eax,`16*3+8-64`($ctx) 575 lea (%rax,%rax,4),%eax # *5 576 mov $h1,$d1 577 mov %eax,`16*4+8-64`($ctx) 578 579 mov \$0x3ffffff,%edx 580 shr \$14,$d1 581 and $d1#d,%edx 582 mov %edx,`16*5+8-64`($ctx) 583 lea (%rdx,%rdx,4),%edx # *5 584 shr \$26,$d1 585 mov %edx,`16*6+8-64`($ctx) 586 587 mov $h2,%rax 588 shl \$24,%rax 589 or %rax,$d1 590 mov $d1#d,`16*7+8-64`($ctx) 591 lea ($d1,$d1,4),$d1 # *5 592 mov $d1#d,`16*8+8-64`($ctx) 593 594 lea -48-64($ctx),$ctx # size [de-]optimization 595 pop %rbp 596 RET 597 .size __poly1305_init_avx,.-__poly1305_init_avx 598 ___ 599 600 &declare_function("poly1305_blocks_avx", 32, 4); 601 $code.=<<___; 602 .cfi_startproc 603 mov 20($ctx),%r8d # is_base2_26 604 cmp \$128,$len 605 jae .Lblocks_avx 606 test %r8d,%r8d 607 jz .Lblocks 608 609 .Lblocks_avx: 610 and \$-16,$len 611 jz .Lno_data_avx 612 613 vzeroupper 614 615 test %r8d,%r8d 616 jz .Lbase2_64_avx 617 618 test \$31,$len 619 jz .Leven_avx 620 621 push %rbp 622 .cfi_push %rbp 623 mov %rsp,%rbp 624 push %rbx 625 .cfi_push %rbx 626 push %r12 627 .cfi_push %r12 628 push %r13 629 .cfi_push %r13 630 push %r14 631 .cfi_push %r14 632 push %r15 633 .cfi_push %r15 634 .Lblocks_avx_body: 635 636 mov $len,%r15 # reassign $len 637 638 mov 0($ctx),$d1 # load hash value 639 mov 8($ctx),$d2 640 mov 16($ctx),$h2#d 641 642 mov 24($ctx),$r0 # load r 643 mov 32($ctx),$s1 644 645 ################################# base 2^26 -> base 2^64 646 mov $d1#d,$h0#d 647 and \$`-1*(1<<31)`,$d1 648 mov $d2,$r1 # borrow $r1 649 mov $d2#d,$h1#d 650 and \$`-1*(1<<31)`,$d2 651 652 shr \$6,$d1 653 shl \$52,$r1 654 add $d1,$h0 655 shr \$12,$h1 656 shr \$18,$d2 657 add $r1,$h0 658 adc $d2,$h1 659 660 mov $h2,$d1 661 shl \$40,$d1 662 shr \$24,$h2 663 add $d1,$h1 664 adc \$0,$h2 # can be partially reduced... 665 666 mov \$-4,$d2 # ... 
so reduce 667 mov $h2,$d1 668 and $h2,$d2 669 shr \$2,$d1 670 and \$3,$h2 671 add $d2,$d1 # =*5 672 add $d1,$h0 673 adc \$0,$h1 674 adc \$0,$h2 675 676 mov $s1,$r1 677 mov $s1,%rax 678 shr \$2,$s1 679 add $r1,$s1 # s1 = r1 + (r1 >> 2) 680 681 add 0($inp),$h0 # accumulate input 682 adc 8($inp),$h1 683 lea 16($inp),$inp 684 adc $padbit,$h2 685 686 call __poly1305_block 687 688 test $padbit,$padbit # if $padbit is zero, 689 jz .Lstore_base2_64_avx # store hash in base 2^64 format 690 691 ################################# base 2^64 -> base 2^26 692 mov $h0,%rax 693 mov $h0,%rdx 694 shr \$52,$h0 695 mov $h1,$r0 696 mov $h1,$r1 697 shr \$26,%rdx 698 and \$0x3ffffff,%rax # h[0] 699 shl \$12,$r0 700 and \$0x3ffffff,%rdx # h[1] 701 shr \$14,$h1 702 or $r0,$h0 703 shl \$24,$h2 704 and \$0x3ffffff,$h0 # h[2] 705 shr \$40,$r1 706 and \$0x3ffffff,$h1 # h[3] 707 or $r1,$h2 # h[4] 708 709 sub \$16,%r15 710 jz .Lstore_base2_26_avx 711 712 vmovd %rax#d,$H0 713 vmovd %rdx#d,$H1 714 vmovd $h0#d,$H2 715 vmovd $h1#d,$H3 716 vmovd $h2#d,$H4 717 jmp .Lproceed_avx 718 719 .align 32 720 .Lstore_base2_64_avx: 721 mov $h0,0($ctx) 722 mov $h1,8($ctx) 723 mov $h2,16($ctx) # note that is_base2_26 is zeroed 724 jmp .Ldone_avx 725 726 .align 16 727 .Lstore_base2_26_avx: 728 mov %rax#d,0($ctx) # store hash value base 2^26 729 mov %rdx#d,4($ctx) 730 mov $h0#d,8($ctx) 731 mov $h1#d,12($ctx) 732 mov $h2#d,16($ctx) 733 .align 16 734 .Ldone_avx: 735 pop %r15 736 .cfi_restore %r15 737 pop %r14 738 .cfi_restore %r14 739 pop %r13 740 .cfi_restore %r13 741 pop %r12 742 .cfi_restore %r12 743 pop %rbx 744 .cfi_restore %rbx 745 pop %rbp 746 .cfi_restore %rbp 747 .Lno_data_avx: 748 .Lblocks_avx_epilogue: 749 RET 750 .cfi_endproc 751 752 .align 32 753 .Lbase2_64_avx: 754 .cfi_startproc 755 push %rbp 756 .cfi_push %rbp 757 mov %rsp,%rbp 758 push %rbx 759 .cfi_push %rbx 760 push %r12 761 .cfi_push %r12 762 push %r13 763 .cfi_push %r13 764 push %r14 765 .cfi_push %r14 766 push %r15 767 .cfi_push %r15 768 .Lbase2_64_avx_body: 769 770 mov $len,%r15 # reassign $len 771 772 mov 24($ctx),$r0 # load r 773 mov 32($ctx),$s1 774 775 mov 0($ctx),$h0 # load hash value 776 mov 8($ctx),$h1 777 mov 16($ctx),$h2#d 778 779 mov $s1,$r1 780 mov $s1,%rax 781 shr \$2,$s1 782 add $r1,$s1 # s1 = r1 + (r1 >> 2) 783 784 test \$31,$len 785 jz .Linit_avx 786 787 add 0($inp),$h0 # accumulate input 788 adc 8($inp),$h1 789 lea 16($inp),$inp 790 adc $padbit,$h2 791 sub \$16,%r15 792 793 call __poly1305_block 794 795 .Linit_avx: 796 ################################# base 2^64 -> base 2^26 797 mov $h0,%rax 798 mov $h0,%rdx 799 shr \$52,$h0 800 mov $h1,$d1 801 mov $h1,$d2 802 shr \$26,%rdx 803 and \$0x3ffffff,%rax # h[0] 804 shl \$12,$d1 805 and \$0x3ffffff,%rdx # h[1] 806 shr \$14,$h1 807 or $d1,$h0 808 shl \$24,$h2 809 and \$0x3ffffff,$h0 # h[2] 810 shr \$40,$d2 811 and \$0x3ffffff,$h1 # h[3] 812 or $d2,$h2 # h[4] 813 814 vmovd %rax#d,$H0 815 vmovd %rdx#d,$H1 816 vmovd $h0#d,$H2 817 vmovd $h1#d,$H3 818 vmovd $h2#d,$H4 819 movl \$1,20($ctx) # set is_base2_26 820 821 call __poly1305_init_avx 822 823 .Lproceed_avx: 824 mov %r15,$len 825 pop %r15 826 .cfi_restore %r15 827 pop %r14 828 .cfi_restore %r14 829 pop %r13 830 .cfi_restore %r13 831 pop %r12 832 .cfi_restore %r12 833 pop %rbx 834 .cfi_restore %rbx 835 pop %rbp 836 .cfi_restore %rbp 837 .Lbase2_64_avx_epilogue: 838 jmp .Ldo_avx 839 .cfi_endproc 840 841 .align 32 842 .Leven_avx: 843 .cfi_startproc 844 vmovd 4*0($ctx),$H0 # load hash value 845 vmovd 4*1($ctx),$H1 846 vmovd 4*2($ctx),$H2 847 vmovd 4*3($ctx),$H3 848 
vmovd 4*4($ctx),$H4 849 850 .Ldo_avx: 851 ___ 852 $code.=<<___ if (!$win64); 853 lea 8(%rsp),%r10 854 .cfi_def_cfa_register %r10 855 and \$-32,%rsp 856 sub \$-8,%rsp 857 lea -0x58(%rsp),%r11 858 sub \$0x178,%rsp 859 ___ 860 $code.=<<___ if ($win64); 861 lea -0xf8(%rsp),%r11 862 sub \$0x218,%rsp 863 vmovdqa %xmm6,0x50(%r11) 864 vmovdqa %xmm7,0x60(%r11) 865 vmovdqa %xmm8,0x70(%r11) 866 vmovdqa %xmm9,0x80(%r11) 867 vmovdqa %xmm10,0x90(%r11) 868 vmovdqa %xmm11,0xa0(%r11) 869 vmovdqa %xmm12,0xb0(%r11) 870 vmovdqa %xmm13,0xc0(%r11) 871 vmovdqa %xmm14,0xd0(%r11) 872 vmovdqa %xmm15,0xe0(%r11) 873 .Ldo_avx_body: 874 ___ 875 $code.=<<___; 876 sub \$64,$len 877 lea -32($inp),%rax 878 cmovc %rax,$inp 879 880 vmovdqu `16*3`($ctx),$D4 # preload r0^2 881 lea `16*3+64`($ctx),$ctx # size optimization 882 lea .Lconst(%rip),%rcx 883 884 ################################################################ 885 # load input 886 vmovdqu 16*2($inp),$T0 887 vmovdqu 16*3($inp),$T1 888 vmovdqa 64(%rcx),$MASK # .Lmask26 889 890 vpsrldq \$6,$T0,$T2 # splat input 891 vpsrldq \$6,$T1,$T3 892 vpunpckhqdq $T1,$T0,$T4 # 4 893 vpunpcklqdq $T1,$T0,$T0 # 0:1 894 vpunpcklqdq $T3,$T2,$T3 # 2:3 895 896 vpsrlq \$40,$T4,$T4 # 4 897 vpsrlq \$26,$T0,$T1 898 vpand $MASK,$T0,$T0 # 0 899 vpsrlq \$4,$T3,$T2 900 vpand $MASK,$T1,$T1 # 1 901 vpsrlq \$30,$T3,$T3 902 vpand $MASK,$T2,$T2 # 2 903 vpand $MASK,$T3,$T3 # 3 904 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 905 906 jbe .Lskip_loop_avx 907 908 # expand and copy pre-calculated table to stack 909 vmovdqu `16*1-64`($ctx),$D1 910 vmovdqu `16*2-64`($ctx),$D2 911 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 912 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 913 vmovdqa $D3,-0x90(%r11) 914 vmovdqa $D0,0x00(%rsp) 915 vpshufd \$0xEE,$D1,$D4 916 vmovdqu `16*3-64`($ctx),$D0 917 vpshufd \$0x44,$D1,$D1 918 vmovdqa $D4,-0x80(%r11) 919 vmovdqa $D1,0x10(%rsp) 920 vpshufd \$0xEE,$D2,$D3 921 vmovdqu `16*4-64`($ctx),$D1 922 vpshufd \$0x44,$D2,$D2 923 vmovdqa $D3,-0x70(%r11) 924 vmovdqa $D2,0x20(%rsp) 925 vpshufd \$0xEE,$D0,$D4 926 vmovdqu `16*5-64`($ctx),$D2 927 vpshufd \$0x44,$D0,$D0 928 vmovdqa $D4,-0x60(%r11) 929 vmovdqa $D0,0x30(%rsp) 930 vpshufd \$0xEE,$D1,$D3 931 vmovdqu `16*6-64`($ctx),$D0 932 vpshufd \$0x44,$D1,$D1 933 vmovdqa $D3,-0x50(%r11) 934 vmovdqa $D1,0x40(%rsp) 935 vpshufd \$0xEE,$D2,$D4 936 vmovdqu `16*7-64`($ctx),$D1 937 vpshufd \$0x44,$D2,$D2 938 vmovdqa $D4,-0x40(%r11) 939 vmovdqa $D2,0x50(%rsp) 940 vpshufd \$0xEE,$D0,$D3 941 vmovdqu `16*8-64`($ctx),$D2 942 vpshufd \$0x44,$D0,$D0 943 vmovdqa $D3,-0x30(%r11) 944 vmovdqa $D0,0x60(%rsp) 945 vpshufd \$0xEE,$D1,$D4 946 vpshufd \$0x44,$D1,$D1 947 vmovdqa $D4,-0x20(%r11) 948 vmovdqa $D1,0x70(%rsp) 949 vpshufd \$0xEE,$D2,$D3 950 vmovdqa 0x00(%rsp),$D4 # preload r0^2 951 vpshufd \$0x44,$D2,$D2 952 vmovdqa $D3,-0x10(%r11) 953 vmovdqa $D2,0x80(%rsp) 954 955 jmp .Loop_avx 956 957 .align 32 958 .Loop_avx: 959 ################################################################ 960 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 961 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 962 # \___________________/ 963 # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 964 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 965 # \___________________/ \____________________/ 966 # 967 # Note that we start with inp[2:3]*r^2. This is because it 968 # doesn't depend on reduction in previous iteration. 
969 ################################################################ 970 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 971 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 972 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 973 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 974 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 975 # 976 # though note that $Tx and $Hx are "reversed" in this section, 977 # and $D4 is preloaded with r0^2... 978 979 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 980 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 981 vmovdqa $H2,0x20(%r11) # offload hash 982 vpmuludq $T2,$D4,$D2 # d3 = h2*r0 983 vmovdqa 0x10(%rsp),$H2 # r1^2 984 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 985 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 986 987 vmovdqa $H0,0x00(%r11) # 988 vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 989 vmovdqa $H1,0x10(%r11) # 990 vpmuludq $T3,$H2,$H1 # h3*r1 991 vpaddq $H0,$D0,$D0 # d0 += h4*s1 992 vpaddq $H1,$D4,$D4 # d4 += h3*r1 993 vmovdqa $H3,0x30(%r11) # 994 vpmuludq $T2,$H2,$H0 # h2*r1 995 vpmuludq $T1,$H2,$H1 # h1*r1 996 vpaddq $H0,$D3,$D3 # d3 += h2*r1 997 vmovdqa 0x30(%rsp),$H3 # r2^2 998 vpaddq $H1,$D2,$D2 # d2 += h1*r1 999 vmovdqa $H4,0x40(%r11) # 1000 vpmuludq $T0,$H2,$H2 # h0*r1 1001 vpmuludq $T2,$H3,$H0 # h2*r2 1002 vpaddq $H2,$D1,$D1 # d1 += h0*r1 1003 1004 vmovdqa 0x40(%rsp),$H4 # s2^2 1005 vpaddq $H0,$D4,$D4 # d4 += h2*r2 1006 vpmuludq $T1,$H3,$H1 # h1*r2 1007 vpmuludq $T0,$H3,$H3 # h0*r2 1008 vpaddq $H1,$D3,$D3 # d3 += h1*r2 1009 vmovdqa 0x50(%rsp),$H2 # r3^2 1010 vpaddq $H3,$D2,$D2 # d2 += h0*r2 1011 vpmuludq $T4,$H4,$H0 # h4*s2 1012 vpmuludq $T3,$H4,$H4 # h3*s2 1013 vpaddq $H0,$D1,$D1 # d1 += h4*s2 1014 vmovdqa 0x60(%rsp),$H3 # s3^2 1015 vpaddq $H4,$D0,$D0 # d0 += h3*s2 1016 1017 vmovdqa 0x80(%rsp),$H4 # s4^2 1018 vpmuludq $T1,$H2,$H1 # h1*r3 1019 vpmuludq $T0,$H2,$H2 # h0*r3 1020 vpaddq $H1,$D4,$D4 # d4 += h1*r3 1021 vpaddq $H2,$D3,$D3 # d3 += h0*r3 1022 vpmuludq $T4,$H3,$H0 # h4*s3 1023 vpmuludq $T3,$H3,$H1 # h3*s3 1024 vpaddq $H0,$D2,$D2 # d2 += h4*s3 1025 vmovdqu 16*0($inp),$H0 # load input 1026 vpaddq $H1,$D1,$D1 # d1 += h3*s3 1027 vpmuludq $T2,$H3,$H3 # h2*s3 1028 vpmuludq $T2,$H4,$T2 # h2*s4 1029 vpaddq $H3,$D0,$D0 # d0 += h2*s3 1030 1031 vmovdqu 16*1($inp),$H1 # 1032 vpaddq $T2,$D1,$D1 # d1 += h2*s4 1033 vpmuludq $T3,$H4,$T3 # h3*s4 1034 vpmuludq $T4,$H4,$T4 # h4*s4 1035 vpsrldq \$6,$H0,$H2 # splat input 1036 vpaddq $T3,$D2,$D2 # d2 += h3*s4 1037 vpaddq $T4,$D3,$D3 # d3 += h4*s4 1038 vpsrldq \$6,$H1,$H3 # 1039 vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 1040 vpmuludq $T1,$H4,$T0 # h1*s4 1041 vpunpckhqdq $H1,$H0,$H4 # 4 1042 vpaddq $T4,$D4,$D4 # d4 += h0*r4 1043 vmovdqa -0x90(%r11),$T4 # r0^4 1044 vpaddq $T0,$D0,$D0 # d0 += h1*s4 1045 1046 vpunpcklqdq $H1,$H0,$H0 # 0:1 1047 vpunpcklqdq $H3,$H2,$H3 # 2:3 1048 1049 #vpsrlq \$40,$H4,$H4 # 4 1050 vpsrldq \$`40/8`,$H4,$H4 # 4 1051 vpsrlq \$26,$H0,$H1 1052 vpand $MASK,$H0,$H0 # 0 1053 vpsrlq \$4,$H3,$H2 1054 vpand $MASK,$H1,$H1 # 1 1055 vpand 0(%rcx),$H4,$H4 # .Lmask24 1056 vpsrlq \$30,$H3,$H3 1057 vpand $MASK,$H2,$H2 # 2 1058 vpand $MASK,$H3,$H3 # 3 1059 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 1060 1061 vpaddq 0x00(%r11),$H0,$H0 # add hash value 1062 vpaddq 0x10(%r11),$H1,$H1 1063 vpaddq 0x20(%r11),$H2,$H2 1064 vpaddq 0x30(%r11),$H3,$H3 1065 vpaddq 0x40(%r11),$H4,$H4 1066 1067 lea 16*2($inp),%rax 1068 lea 16*4($inp),$inp 1069 sub \$64,$len 1070 cmovc %rax,$inp 1071 1072 ################################################################ 1073 # Now we accumulate (inp[0:1]+hash)*r^4 1074 
################################################################ 1075 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1076 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1077 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1078 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1079 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1080 1081 vpmuludq $H0,$T4,$T0 # h0*r0 1082 vpmuludq $H1,$T4,$T1 # h1*r0 1083 vpaddq $T0,$D0,$D0 1084 vpaddq $T1,$D1,$D1 1085 vmovdqa -0x80(%r11),$T2 # r1^4 1086 vpmuludq $H2,$T4,$T0 # h2*r0 1087 vpmuludq $H3,$T4,$T1 # h3*r0 1088 vpaddq $T0,$D2,$D2 1089 vpaddq $T1,$D3,$D3 1090 vpmuludq $H4,$T4,$T4 # h4*r0 1091 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 1092 vpaddq $T4,$D4,$D4 1093 1094 vpaddq $T0,$D0,$D0 # d0 += h4*s1 1095 vpmuludq $H2,$T2,$T1 # h2*r1 1096 vpmuludq $H3,$T2,$T0 # h3*r1 1097 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1098 vmovdqa -0x60(%r11),$T3 # r2^4 1099 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1100 vpmuludq $H1,$T2,$T1 # h1*r1 1101 vpmuludq $H0,$T2,$T2 # h0*r1 1102 vpaddq $T1,$D2,$D2 # d2 += h1*r1 1103 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1104 1105 vmovdqa -0x50(%r11),$T4 # s2^4 1106 vpmuludq $H2,$T3,$T0 # h2*r2 1107 vpmuludq $H1,$T3,$T1 # h1*r2 1108 vpaddq $T0,$D4,$D4 # d4 += h2*r2 1109 vpaddq $T1,$D3,$D3 # d3 += h1*r2 1110 vmovdqa -0x40(%r11),$T2 # r3^4 1111 vpmuludq $H0,$T3,$T3 # h0*r2 1112 vpmuludq $H4,$T4,$T0 # h4*s2 1113 vpaddq $T3,$D2,$D2 # d2 += h0*r2 1114 vpaddq $T0,$D1,$D1 # d1 += h4*s2 1115 vmovdqa -0x30(%r11),$T3 # s3^4 1116 vpmuludq $H3,$T4,$T4 # h3*s2 1117 vpmuludq $H1,$T2,$T1 # h1*r3 1118 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1119 1120 vmovdqa -0x10(%r11),$T4 # s4^4 1121 vpaddq $T1,$D4,$D4 # d4 += h1*r3 1122 vpmuludq $H0,$T2,$T2 # h0*r3 1123 vpmuludq $H4,$T3,$T0 # h4*s3 1124 vpaddq $T2,$D3,$D3 # d3 += h0*r3 1125 vpaddq $T0,$D2,$D2 # d2 += h4*s3 1126 vmovdqu 16*2($inp),$T0 # load input 1127 vpmuludq $H3,$T3,$T2 # h3*s3 1128 vpmuludq $H2,$T3,$T3 # h2*s3 1129 vpaddq $T2,$D1,$D1 # d1 += h3*s3 1130 vmovdqu 16*3($inp),$T1 # 1131 vpaddq $T3,$D0,$D0 # d0 += h2*s3 1132 1133 vpmuludq $H2,$T4,$H2 # h2*s4 1134 vpmuludq $H3,$T4,$H3 # h3*s4 1135 vpsrldq \$6,$T0,$T2 # splat input 1136 vpaddq $H2,$D1,$D1 # d1 += h2*s4 1137 vpmuludq $H4,$T4,$H4 # h4*s4 1138 vpsrldq \$6,$T1,$T3 # 1139 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 1140 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 1141 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 1142 vpmuludq $H1,$T4,$H0 1143 vpunpckhqdq $T1,$T0,$T4 # 4 1144 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1145 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1146 1147 vpunpcklqdq $T1,$T0,$T0 # 0:1 1148 vpunpcklqdq $T3,$T2,$T3 # 2:3 1149 1150 #vpsrlq \$40,$T4,$T4 # 4 1151 vpsrldq \$`40/8`,$T4,$T4 # 4 1152 vpsrlq \$26,$T0,$T1 1153 vmovdqa 0x00(%rsp),$D4 # preload r0^2 1154 vpand $MASK,$T0,$T0 # 0 1155 vpsrlq \$4,$T3,$T2 1156 vpand $MASK,$T1,$T1 # 1 1157 vpand 0(%rcx),$T4,$T4 # .Lmask24 1158 vpsrlq \$30,$T3,$T3 1159 vpand $MASK,$T2,$T2 # 2 1160 vpand $MASK,$T3,$T3 # 3 1161 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1162 1163 ################################################################ 1164 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 1165 # and P. 
Schwabe 1166 1167 vpsrlq \$26,$H3,$D3 1168 vpand $MASK,$H3,$H3 1169 vpaddq $D3,$H4,$H4 # h3 -> h4 1170 1171 vpsrlq \$26,$H0,$D0 1172 vpand $MASK,$H0,$H0 1173 vpaddq $D0,$D1,$H1 # h0 -> h1 1174 1175 vpsrlq \$26,$H4,$D0 1176 vpand $MASK,$H4,$H4 1177 1178 vpsrlq \$26,$H1,$D1 1179 vpand $MASK,$H1,$H1 1180 vpaddq $D1,$H2,$H2 # h1 -> h2 1181 1182 vpaddq $D0,$H0,$H0 1183 vpsllq \$2,$D0,$D0 1184 vpaddq $D0,$H0,$H0 # h4 -> h0 1185 1186 vpsrlq \$26,$H2,$D2 1187 vpand $MASK,$H2,$H2 1188 vpaddq $D2,$H3,$H3 # h2 -> h3 1189 1190 vpsrlq \$26,$H0,$D0 1191 vpand $MASK,$H0,$H0 1192 vpaddq $D0,$H1,$H1 # h0 -> h1 1193 1194 vpsrlq \$26,$H3,$D3 1195 vpand $MASK,$H3,$H3 1196 vpaddq $D3,$H4,$H4 # h3 -> h4 1197 1198 ja .Loop_avx 1199 1200 .Lskip_loop_avx: 1201 ################################################################ 1202 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 1203 1204 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 1205 add \$32,$len 1206 jnz .Long_tail_avx 1207 1208 vpaddq $H2,$T2,$T2 1209 vpaddq $H0,$T0,$T0 1210 vpaddq $H1,$T1,$T1 1211 vpaddq $H3,$T3,$T3 1212 vpaddq $H4,$T4,$T4 1213 1214 .Long_tail_avx: 1215 vmovdqa $H2,0x20(%r11) 1216 vmovdqa $H0,0x00(%r11) 1217 vmovdqa $H1,0x10(%r11) 1218 vmovdqa $H3,0x30(%r11) 1219 vmovdqa $H4,0x40(%r11) 1220 1221 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1222 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1223 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1224 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1225 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1226 1227 vpmuludq $T2,$D4,$D2 # d2 = h2*r0 1228 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 1229 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n 1230 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 1231 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 1232 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 1233 1234 vpmuludq $T3,$H2,$H0 # h3*r1 1235 vpaddq $H0,$D4,$D4 # d4 += h3*r1 1236 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n 1237 vpmuludq $T2,$H2,$H1 # h2*r1 1238 vpaddq $H1,$D3,$D3 # d3 += h2*r1 1239 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n 1240 vpmuludq $T1,$H2,$H0 # h1*r1 1241 vpaddq $H0,$D2,$D2 # d2 += h1*r1 1242 vpmuludq $T0,$H2,$H2 # h0*r1 1243 vpaddq $H2,$D1,$D1 # d1 += h0*r1 1244 vpmuludq $T4,$H3,$H3 # h4*s1 1245 vpaddq $H3,$D0,$D0 # d0 += h4*s1 1246 1247 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n 1248 vpmuludq $T2,$H4,$H1 # h2*r2 1249 vpaddq $H1,$D4,$D4 # d4 += h2*r2 1250 vpmuludq $T1,$H4,$H0 # h1*r2 1251 vpaddq $H0,$D3,$D3 # d3 += h1*r2 1252 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n 1253 vpmuludq $T0,$H4,$H4 # h0*r2 1254 vpaddq $H4,$D2,$D2 # d2 += h0*r2 1255 vpmuludq $T4,$H2,$H1 # h4*s2 1256 vpaddq $H1,$D1,$D1 # d1 += h4*s2 1257 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n 1258 vpmuludq $T3,$H2,$H2 # h3*s2 1259 vpaddq $H2,$D0,$D0 # d0 += h3*s2 1260 1261 vpmuludq $T1,$H3,$H0 # h1*r3 1262 vpaddq $H0,$D4,$D4 # d4 += h1*r3 1263 vpmuludq $T0,$H3,$H3 # h0*r3 1264 vpaddq $H3,$D3,$D3 # d3 += h0*r3 1265 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n 1266 vpmuludq $T4,$H4,$H1 # h4*s3 1267 vpaddq $H1,$D2,$D2 # d2 += h4*s3 1268 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n 1269 vpmuludq $T3,$H4,$H0 # h3*s3 1270 vpaddq $H0,$D1,$D1 # d1 += h3*s3 1271 vpmuludq $T2,$H4,$H4 # h2*s3 1272 vpaddq $H4,$D0,$D0 # d0 += h2*s3 1273 1274 vpmuludq $T0,$H2,$H2 # h0*r4 1275 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 1276 vpmuludq $T4,$H3,$H1 # h4*s4 1277 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 1278 vpmuludq $T3,$H3,$H0 # h3*s4 1279 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 1280 vpmuludq $T2,$H3,$H1 # h2*s4 1281 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 1282 vpmuludq $T1,$H3,$H3 # h1*s4 1283 vpaddq 
$H3,$D0,$D0 # h0 = d0 + h1*s4 1284 1285 jz .Lshort_tail_avx 1286 1287 vmovdqu 16*0($inp),$H0 # load input 1288 vmovdqu 16*1($inp),$H1 1289 1290 vpsrldq \$6,$H0,$H2 # splat input 1291 vpsrldq \$6,$H1,$H3 1292 vpunpckhqdq $H1,$H0,$H4 # 4 1293 vpunpcklqdq $H1,$H0,$H0 # 0:1 1294 vpunpcklqdq $H3,$H2,$H3 # 2:3 1295 1296 vpsrlq \$40,$H4,$H4 # 4 1297 vpsrlq \$26,$H0,$H1 1298 vpand $MASK,$H0,$H0 # 0 1299 vpsrlq \$4,$H3,$H2 1300 vpand $MASK,$H1,$H1 # 1 1301 vpsrlq \$30,$H3,$H3 1302 vpand $MASK,$H2,$H2 # 2 1303 vpand $MASK,$H3,$H3 # 3 1304 vpor 32(%rcx),$H4,$H4 # padbit, yes, always 1305 1306 vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 1307 vpaddq 0x00(%r11),$H0,$H0 1308 vpaddq 0x10(%r11),$H1,$H1 1309 vpaddq 0x20(%r11),$H2,$H2 1310 vpaddq 0x30(%r11),$H3,$H3 1311 vpaddq 0x40(%r11),$H4,$H4 1312 1313 ################################################################ 1314 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate 1315 1316 vpmuludq $H0,$T4,$T0 # h0*r0 1317 vpaddq $T0,$D0,$D0 # d0 += h0*r0 1318 vpmuludq $H1,$T4,$T1 # h1*r0 1319 vpaddq $T1,$D1,$D1 # d1 += h1*r0 1320 vpmuludq $H2,$T4,$T0 # h2*r0 1321 vpaddq $T0,$D2,$D2 # d2 += h2*r0 1322 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n 1323 vpmuludq $H3,$T4,$T1 # h3*r0 1324 vpaddq $T1,$D3,$D3 # d3 += h3*r0 1325 vpmuludq $H4,$T4,$T4 # h4*r0 1326 vpaddq $T4,$D4,$D4 # d4 += h4*r0 1327 1328 vpmuludq $H3,$T2,$T0 # h3*r1 1329 vpaddq $T0,$D4,$D4 # d4 += h3*r1 1330 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 1331 vpmuludq $H2,$T2,$T1 # h2*r1 1332 vpaddq $T1,$D3,$D3 # d3 += h2*r1 1333 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 1334 vpmuludq $H1,$T2,$T0 # h1*r1 1335 vpaddq $T0,$D2,$D2 # d2 += h1*r1 1336 vpmuludq $H0,$T2,$T2 # h0*r1 1337 vpaddq $T2,$D1,$D1 # d1 += h0*r1 1338 vpmuludq $H4,$T3,$T3 # h4*s1 1339 vpaddq $T3,$D0,$D0 # d0 += h4*s1 1340 1341 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 1342 vpmuludq $H2,$T4,$T1 # h2*r2 1343 vpaddq $T1,$D4,$D4 # d4 += h2*r2 1344 vpmuludq $H1,$T4,$T0 # h1*r2 1345 vpaddq $T0,$D3,$D3 # d3 += h1*r2 1346 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 1347 vpmuludq $H0,$T4,$T4 # h0*r2 1348 vpaddq $T4,$D2,$D2 # d2 += h0*r2 1349 vpmuludq $H4,$T2,$T1 # h4*s2 1350 vpaddq $T1,$D1,$D1 # d1 += h4*s2 1351 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 1352 vpmuludq $H3,$T2,$T2 # h3*s2 1353 vpaddq $T2,$D0,$D0 # d0 += h3*s2 1354 1355 vpmuludq $H1,$T3,$T0 # h1*r3 1356 vpaddq $T0,$D4,$D4 # d4 += h1*r3 1357 vpmuludq $H0,$T3,$T3 # h0*r3 1358 vpaddq $T3,$D3,$D3 # d3 += h0*r3 1359 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 1360 vpmuludq $H4,$T4,$T1 # h4*s3 1361 vpaddq $T1,$D2,$D2 # d2 += h4*s3 1362 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 1363 vpmuludq $H3,$T4,$T0 # h3*s3 1364 vpaddq $T0,$D1,$D1 # d1 += h3*s3 1365 vpmuludq $H2,$T4,$T4 # h2*s3 1366 vpaddq $T4,$D0,$D0 # d0 += h2*s3 1367 1368 vpmuludq $H0,$T2,$T2 # h0*r4 1369 vpaddq $T2,$D4,$D4 # d4 += h0*r4 1370 vpmuludq $H4,$T3,$T1 # h4*s4 1371 vpaddq $T1,$D3,$D3 # d3 += h4*s4 1372 vpmuludq $H3,$T3,$T0 # h3*s4 1373 vpaddq $T0,$D2,$D2 # d2 += h3*s4 1374 vpmuludq $H2,$T3,$T1 # h2*s4 1375 vpaddq $T1,$D1,$D1 # d1 += h2*s4 1376 vpmuludq $H1,$T3,$T3 # h1*s4 1377 vpaddq $T3,$D0,$D0 # d0 += h1*s4 1378 1379 .Lshort_tail_avx: 1380 ################################################################ 1381 # horizontal addition 1382 1383 vpsrldq \$8,$D4,$T4 1384 vpsrldq \$8,$D3,$T3 1385 vpsrldq \$8,$D1,$T1 1386 vpsrldq \$8,$D0,$T0 1387 vpsrldq \$8,$D2,$T2 1388 vpaddq $T3,$D3,$D3 1389 vpaddq $T4,$D4,$D4 1390 vpaddq $T0,$D0,$D0 1391 vpaddq $T1,$D1,$D1 1392 vpaddq $T2,$D2,$D2 1393 1394 
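################################################################
# Editorial note on the lazy reduction used throughout the vector
# paths: only one carry is propagated out of each 26-bit limb per
# step (h3->h4, h0->h1, h4->h0 via *5, h1->h2, h2->h3, ...), so
# each limb stays just below 2^26 plus a small excess rather than
# being fully reduced.  The canonical reduction modulo 2^130-5 is
# deferred to the emit routines.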
################################################################ 1395 # lazy reduction 1396 1397 vpsrlq \$26,$D3,$H3 1398 vpand $MASK,$D3,$D3 1399 vpaddq $H3,$D4,$D4 # h3 -> h4 1400 1401 vpsrlq \$26,$D0,$H0 1402 vpand $MASK,$D0,$D0 1403 vpaddq $H0,$D1,$D1 # h0 -> h1 1404 1405 vpsrlq \$26,$D4,$H4 1406 vpand $MASK,$D4,$D4 1407 1408 vpsrlq \$26,$D1,$H1 1409 vpand $MASK,$D1,$D1 1410 vpaddq $H1,$D2,$D2 # h1 -> h2 1411 1412 vpaddq $H4,$D0,$D0 1413 vpsllq \$2,$H4,$H4 1414 vpaddq $H4,$D0,$D0 # h4 -> h0 1415 1416 vpsrlq \$26,$D2,$H2 1417 vpand $MASK,$D2,$D2 1418 vpaddq $H2,$D3,$D3 # h2 -> h3 1419 1420 vpsrlq \$26,$D0,$H0 1421 vpand $MASK,$D0,$D0 1422 vpaddq $H0,$D1,$D1 # h0 -> h1 1423 1424 vpsrlq \$26,$D3,$H3 1425 vpand $MASK,$D3,$D3 1426 vpaddq $H3,$D4,$D4 # h3 -> h4 1427 1428 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced 1429 vmovd $D1,`4*1-48-64`($ctx) 1430 vmovd $D2,`4*2-48-64`($ctx) 1431 vmovd $D3,`4*3-48-64`($ctx) 1432 vmovd $D4,`4*4-48-64`($ctx) 1433 ___ 1434 $code.=<<___ if ($win64); 1435 vmovdqa 0x50(%r11),%xmm6 1436 vmovdqa 0x60(%r11),%xmm7 1437 vmovdqa 0x70(%r11),%xmm8 1438 vmovdqa 0x80(%r11),%xmm9 1439 vmovdqa 0x90(%r11),%xmm10 1440 vmovdqa 0xa0(%r11),%xmm11 1441 vmovdqa 0xb0(%r11),%xmm12 1442 vmovdqa 0xc0(%r11),%xmm13 1443 vmovdqa 0xd0(%r11),%xmm14 1444 vmovdqa 0xe0(%r11),%xmm15 1445 lea 0xf8(%r11),%rsp 1446 .Ldo_avx_epilogue: 1447 ___ 1448 $code.=<<___ if (!$win64); 1449 lea -8(%r10),%rsp 1450 .cfi_def_cfa_register %rsp 1451 ___ 1452 $code.=<<___; 1453 vzeroupper 1454 RET 1455 .cfi_endproc 1456 ___ 1457 &end_function("poly1305_blocks_avx"); 1458 1459 &declare_function("poly1305_emit_avx", 32, 3); 1460 $code.=<<___; 1461 cmpl \$0,20($ctx) # is_base2_26? 1462 je .Lemit 1463 1464 mov 0($ctx),%eax # load hash value base 2^26 1465 mov 4($ctx),%ecx 1466 mov 8($ctx),%r8d 1467 mov 12($ctx),%r11d 1468 mov 16($ctx),%r10d 1469 1470 shl \$26,%rcx # base 2^26 -> base 2^64 1471 mov %r8,%r9 1472 shl \$52,%r8 1473 add %rcx,%rax 1474 shr \$12,%r9 1475 add %rax,%r8 # h0 1476 adc \$0,%r9 1477 1478 shl \$14,%r11 1479 mov %r10,%rax 1480 shr \$24,%r10 1481 add %r11,%r9 1482 shl \$40,%rax 1483 add %rax,%r9 # h1 1484 adc \$0,%r10 # h2 1485 1486 mov %r10,%rax # could be partially reduced, so reduce 1487 mov %r10,%rcx 1488 and \$3,%r10 1489 shr \$2,%rax 1490 and \$-4,%rcx 1491 add %rcx,%rax 1492 add %rax,%r8 1493 adc \$0,%r9 1494 adc \$0,%r10 1495 1496 mov %r8,%rax 1497 add \$5,%r8 # compare to modulus 1498 mov %r9,%rcx 1499 adc \$0,%r9 1500 adc \$0,%r10 1501 shr \$2,%r10 # did 130-bit value overflow? 1502 cmovnz %r8,%rax 1503 cmovnz %r9,%rcx 1504 1505 add 0($nonce),%rax # accumulate nonce 1506 adc 8($nonce),%rcx 1507 mov %rax,0($mac) # write result 1508 mov %rcx,8($mac) 1509 1510 RET 1511 ___ 1512 &end_function("poly1305_emit_avx"); 1513 1514 if ($avx>1) { 1515 1516 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = 1517 map("%ymm$_",(0..15)); 1518 my $S4=$MASK; 1519 1520 sub poly1305_blocks_avxN { 1521 my ($avx512) = @_; 1522 my $suffix = $avx512 ? 
"_avx512" : ""; 1523 $code.=<<___; 1524 .cfi_startproc 1525 mov 20($ctx),%r8d # is_base2_26 1526 cmp \$128,$len 1527 jae .Lblocks_avx2$suffix 1528 test %r8d,%r8d 1529 jz .Lblocks 1530 1531 .Lblocks_avx2$suffix: 1532 and \$-16,$len 1533 jz .Lno_data_avx2$suffix 1534 1535 vzeroupper 1536 1537 test %r8d,%r8d 1538 jz .Lbase2_64_avx2$suffix 1539 1540 test \$63,$len 1541 jz .Leven_avx2$suffix 1542 1543 push %rbp 1544 .cfi_push %rbp 1545 mov %rsp,%rbp 1546 push %rbx 1547 .cfi_push %rbx 1548 push %r12 1549 .cfi_push %r12 1550 push %r13 1551 .cfi_push %r13 1552 push %r14 1553 .cfi_push %r14 1554 push %r15 1555 .cfi_push %r15 1556 .Lblocks_avx2_body$suffix: 1557 1558 mov $len,%r15 # reassign $len 1559 1560 mov 0($ctx),$d1 # load hash value 1561 mov 8($ctx),$d2 1562 mov 16($ctx),$h2#d 1563 1564 mov 24($ctx),$r0 # load r 1565 mov 32($ctx),$s1 1566 1567 ################################# base 2^26 -> base 2^64 1568 mov $d1#d,$h0#d 1569 and \$`-1*(1<<31)`,$d1 1570 mov $d2,$r1 # borrow $r1 1571 mov $d2#d,$h1#d 1572 and \$`-1*(1<<31)`,$d2 1573 1574 shr \$6,$d1 1575 shl \$52,$r1 1576 add $d1,$h0 1577 shr \$12,$h1 1578 shr \$18,$d2 1579 add $r1,$h0 1580 adc $d2,$h1 1581 1582 mov $h2,$d1 1583 shl \$40,$d1 1584 shr \$24,$h2 1585 add $d1,$h1 1586 adc \$0,$h2 # can be partially reduced... 1587 1588 mov \$-4,$d2 # ... so reduce 1589 mov $h2,$d1 1590 and $h2,$d2 1591 shr \$2,$d1 1592 and \$3,$h2 1593 add $d2,$d1 # =*5 1594 add $d1,$h0 1595 adc \$0,$h1 1596 adc \$0,$h2 1597 1598 mov $s1,$r1 1599 mov $s1,%rax 1600 shr \$2,$s1 1601 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1602 1603 .Lbase2_26_pre_avx2$suffix: 1604 add 0($inp),$h0 # accumulate input 1605 adc 8($inp),$h1 1606 lea 16($inp),$inp 1607 adc $padbit,$h2 1608 sub \$16,%r15 1609 1610 call __poly1305_block 1611 mov $r1,%rax 1612 1613 test \$63,%r15 1614 jnz .Lbase2_26_pre_avx2$suffix 1615 1616 test $padbit,$padbit # if $padbit is zero, 1617 jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format 1618 1619 ################################# base 2^64 -> base 2^26 1620 mov $h0,%rax 1621 mov $h0,%rdx 1622 shr \$52,$h0 1623 mov $h1,$r0 1624 mov $h1,$r1 1625 shr \$26,%rdx 1626 and \$0x3ffffff,%rax # h[0] 1627 shl \$12,$r0 1628 and \$0x3ffffff,%rdx # h[1] 1629 shr \$14,$h1 1630 or $r0,$h0 1631 shl \$24,$h2 1632 and \$0x3ffffff,$h0 # h[2] 1633 shr \$40,$r1 1634 and \$0x3ffffff,$h1 # h[3] 1635 or $r1,$h2 # h[4] 1636 1637 test %r15,%r15 1638 jz .Lstore_base2_26_avx2$suffix 1639 1640 vmovd %rax#d,%x#$H0 1641 vmovd %rdx#d,%x#$H1 1642 vmovd $h0#d,%x#$H2 1643 vmovd $h1#d,%x#$H3 1644 vmovd $h2#d,%x#$H4 1645 jmp .Lproceed_avx2$suffix 1646 1647 .align 32 1648 .Lstore_base2_64_avx2$suffix: 1649 mov $h0,0($ctx) 1650 mov $h1,8($ctx) 1651 mov $h2,16($ctx) # note that is_base2_26 is zeroed 1652 jmp .Ldone_avx2$suffix 1653 1654 .align 16 1655 .Lstore_base2_26_avx2$suffix: 1656 mov %rax#d,0($ctx) # store hash value base 2^26 1657 mov %rdx#d,4($ctx) 1658 mov $h0#d,8($ctx) 1659 mov $h1#d,12($ctx) 1660 mov $h2#d,16($ctx) 1661 .align 16 1662 .Ldone_avx2$suffix: 1663 pop %r15 1664 .cfi_restore %r15 1665 pop %r14 1666 .cfi_restore %r14 1667 pop %r13 1668 .cfi_restore %r13 1669 pop %r12 1670 .cfi_restore %r12 1671 pop %rbx 1672 .cfi_restore %rbx 1673 pop %rbp 1674 .cfi_restore %rbp 1675 .Lno_data_avx2$suffix: 1676 .Lblocks_avx2_epilogue$suffix: 1677 RET 1678 .cfi_endproc 1679 1680 .align 32 1681 .Lbase2_64_avx2$suffix: 1682 .cfi_startproc 1683 push %rbp 1684 .cfi_push %rbp 1685 mov %rsp,%rbp 1686 push %rbx 1687 .cfi_push %rbx 1688 push %r12 1689 .cfi_push %r12 1690 push %r13 1691 .cfi_push 
%r13 1692 push %r14 1693 .cfi_push %r14 1694 push %r15 1695 .cfi_push %r15 1696 .Lbase2_64_avx2_body$suffix: 1697 1698 mov $len,%r15 # reassign $len 1699 1700 mov 24($ctx),$r0 # load r 1701 mov 32($ctx),$s1 1702 1703 mov 0($ctx),$h0 # load hash value 1704 mov 8($ctx),$h1 1705 mov 16($ctx),$h2#d 1706 1707 mov $s1,$r1 1708 mov $s1,%rax 1709 shr \$2,$s1 1710 add $r1,$s1 # s1 = r1 + (r1 >> 2) 1711 1712 test \$63,$len 1713 jz .Linit_avx2$suffix 1714 1715 .Lbase2_64_pre_avx2$suffix: 1716 add 0($inp),$h0 # accumulate input 1717 adc 8($inp),$h1 1718 lea 16($inp),$inp 1719 adc $padbit,$h2 1720 sub \$16,%r15 1721 1722 call __poly1305_block 1723 mov $r1,%rax 1724 1725 test \$63,%r15 1726 jnz .Lbase2_64_pre_avx2$suffix 1727 1728 .Linit_avx2$suffix: 1729 ################################# base 2^64 -> base 2^26 1730 mov $h0,%rax 1731 mov $h0,%rdx 1732 shr \$52,$h0 1733 mov $h1,$d1 1734 mov $h1,$d2 1735 shr \$26,%rdx 1736 and \$0x3ffffff,%rax # h[0] 1737 shl \$12,$d1 1738 and \$0x3ffffff,%rdx # h[1] 1739 shr \$14,$h1 1740 or $d1,$h0 1741 shl \$24,$h2 1742 and \$0x3ffffff,$h0 # h[2] 1743 shr \$40,$d2 1744 and \$0x3ffffff,$h1 # h[3] 1745 or $d2,$h2 # h[4] 1746 1747 vmovd %rax#d,%x#$H0 1748 vmovd %rdx#d,%x#$H1 1749 vmovd $h0#d,%x#$H2 1750 vmovd $h1#d,%x#$H3 1751 vmovd $h2#d,%x#$H4 1752 movl \$1,20($ctx) # set is_base2_26 1753 1754 call __poly1305_init_avx 1755 1756 .Lproceed_avx2$suffix: 1757 mov %r15,$len # restore $len 1758 ___ 1759 $code.=<<___ if (!$kernel); 1760 mov OPENSSL_ia32cap_P+8(%rip),%r9d 1761 mov \$`(1<<31|1<<30|1<<16)`,%r11d 1762 ___ 1763 $code.=<<___; 1764 pop %r15 1765 .cfi_restore %r15 1766 pop %r14 1767 .cfi_restore %r14 1768 pop %r13 1769 .cfi_restore %r13 1770 pop %r12 1771 .cfi_restore %r12 1772 pop %rbx 1773 .cfi_restore %rbx 1774 pop %rbp 1775 .cfi_restore %rbp 1776 .Lbase2_64_avx2_epilogue$suffix: 1777 jmp .Ldo_avx2$suffix 1778 .cfi_endproc 1779 1780 .align 32 1781 .Leven_avx2$suffix: 1782 .cfi_startproc 1783 ___ 1784 $code.=<<___ if (!$kernel); 1785 mov OPENSSL_ia32cap_P+8(%rip),%r9d 1786 ___ 1787 $code.=<<___; 1788 vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 1789 vmovd 4*1($ctx),%x#$H1 1790 vmovd 4*2($ctx),%x#$H2 1791 vmovd 4*3($ctx),%x#$H3 1792 vmovd 4*4($ctx),%x#$H4 1793 1794 .Ldo_avx2$suffix: 1795 ___ 1796 $code.=<<___ if (!$kernel && $avx>2); 1797 cmp \$512,$len 1798 jb .Lskip_avx512 1799 and %r11d,%r9d 1800 test \$`1<<16`,%r9d # check for AVX512F 1801 jnz .Lblocks_avx512 1802 .Lskip_avx512$suffix: 1803 ___ 1804 $code.=<<___ if ($avx > 2 && $avx512 && $kernel); 1805 cmp \$512,$len 1806 jae .Lblocks_avx512 1807 ___ 1808 $code.=<<___ if (!$win64); 1809 lea 8(%rsp),%r10 1810 .cfi_def_cfa_register %r10 1811 sub \$0x128,%rsp 1812 ___ 1813 $code.=<<___ if ($win64); 1814 lea 8(%rsp),%r10 1815 sub \$0x1c8,%rsp 1816 vmovdqa %xmm6,-0xb0(%r10) 1817 vmovdqa %xmm7,-0xa0(%r10) 1818 vmovdqa %xmm8,-0x90(%r10) 1819 vmovdqa %xmm9,-0x80(%r10) 1820 vmovdqa %xmm10,-0x70(%r10) 1821 vmovdqa %xmm11,-0x60(%r10) 1822 vmovdqa %xmm12,-0x50(%r10) 1823 vmovdqa %xmm13,-0x40(%r10) 1824 vmovdqa %xmm14,-0x30(%r10) 1825 vmovdqa %xmm15,-0x20(%r10) 1826 .Ldo_avx2_body$suffix: 1827 ___ 1828 $code.=<<___; 1829 lea .Lconst(%rip),%rcx 1830 lea 48+64($ctx),$ctx # size optimization 1831 vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 1832 1833 # expand and copy pre-calculated table to stack 1834 vmovdqu `16*0-64`($ctx),%x#$T2 1835 and \$-512,%rsp 1836 vmovdqu `16*1-64`($ctx),%x#$T3 1837 vmovdqu `16*2-64`($ctx),%x#$T4 1838 vmovdqu `16*3-64`($ctx),%x#$D0 1839 vmovdqu `16*4-64`($ctx),%x#$D1 1840 vmovdqu `16*5-64`($ctx),%x#$D2 
1841 lea 0x90(%rsp),%rax # size optimization 1842 vmovdqu `16*6-64`($ctx),%x#$D3 1843 vpermd $T2,$T0,$T2 # 00003412 -> 14243444 1844 vmovdqu `16*7-64`($ctx),%x#$D4 1845 vpermd $T3,$T0,$T3 1846 vmovdqu `16*8-64`($ctx),%x#$MASK 1847 vpermd $T4,$T0,$T4 1848 vmovdqa $T2,0x00(%rsp) 1849 vpermd $D0,$T0,$D0 1850 vmovdqa $T3,0x20-0x90(%rax) 1851 vpermd $D1,$T0,$D1 1852 vmovdqa $T4,0x40-0x90(%rax) 1853 vpermd $D2,$T0,$D2 1854 vmovdqa $D0,0x60-0x90(%rax) 1855 vpermd $D3,$T0,$D3 1856 vmovdqa $D1,0x80-0x90(%rax) 1857 vpermd $D4,$T0,$D4 1858 vmovdqa $D2,0xa0-0x90(%rax) 1859 vpermd $MASK,$T0,$MASK 1860 vmovdqa $D3,0xc0-0x90(%rax) 1861 vmovdqa $D4,0xe0-0x90(%rax) 1862 vmovdqa $MASK,0x100-0x90(%rax) 1863 vmovdqa 64(%rcx),$MASK # .Lmask26 1864 1865 ################################################################ 1866 # load input 1867 vmovdqu 16*0($inp),%x#$T0 1868 vmovdqu 16*1($inp),%x#$T1 1869 vinserti128 \$1,16*2($inp),$T0,$T0 1870 vinserti128 \$1,16*3($inp),$T1,$T1 1871 lea 16*4($inp),$inp 1872 1873 vpsrldq \$6,$T0,$T2 # splat input 1874 vpsrldq \$6,$T1,$T3 1875 vpunpckhqdq $T1,$T0,$T4 # 4 1876 vpunpcklqdq $T3,$T2,$T2 # 2:3 1877 vpunpcklqdq $T1,$T0,$T0 # 0:1 1878 1879 vpsrlq \$30,$T2,$T3 1880 vpsrlq \$4,$T2,$T2 1881 vpsrlq \$26,$T0,$T1 1882 vpsrlq \$40,$T4,$T4 # 4 1883 vpand $MASK,$T2,$T2 # 2 1884 vpand $MASK,$T0,$T0 # 0 1885 vpand $MASK,$T1,$T1 # 1 1886 vpand $MASK,$T3,$T3 # 3 1887 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1888 1889 vpaddq $H2,$T2,$H2 # accumulate input 1890 sub \$64,$len 1891 jz .Ltail_avx2$suffix 1892 jmp .Loop_avx2$suffix 1893 1894 .align 32 1895 .Loop_avx2$suffix: 1896 ################################################################ 1897 # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 1898 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 1899 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 1900 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 1901 # \________/\__________/ 1902 ################################################################ 1903 #vpaddq $H2,$T2,$H2 # accumulate input 1904 vpaddq $H0,$T0,$H0 1905 vmovdqa `32*0`(%rsp),$T0 # r0^4 1906 vpaddq $H1,$T1,$H1 1907 vmovdqa `32*1`(%rsp),$T1 # r1^4 1908 vpaddq $H3,$T3,$H3 1909 vmovdqa `32*3`(%rsp),$T2 # r2^4 1910 vpaddq $H4,$T4,$H4 1911 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 1912 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 1913 1914 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1915 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1916 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1917 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1918 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1919 # 1920 # however, as h2 is "chronologically" first one available pull 1921 # corresponding operations up, so it's 1922 # 1923 # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 1924 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 1925 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1926 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 1927 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 1928 1929 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 1930 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 1931 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 1932 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 1933 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 1934 1935 vpmuludq $H0,$T1,$T4 # h0*r1 1936 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp 1937 vpaddq $T4,$D1,$D1 # d1 += h0*r1 1938 vpaddq $H2,$D2,$D2 # d2 += h1*r1 1939 vpmuludq $H3,$T1,$T4 # h3*r1 1940 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 1941 vpaddq $T4,$D4,$D4 # d4 += h3*r1 1942 vpaddq $H2,$D0,$D0 # d0 += h4*s1 1943 vmovdqa `32*4-0x90`(%rax),$T1 # s2 1944 1945 vpmuludq $H0,$T0,$T4 # 
h0*r0 1946 vpmuludq $H1,$T0,$H2 # h1*r0 1947 vpaddq $T4,$D0,$D0 # d0 += h0*r0 1948 vpaddq $H2,$D1,$D1 # d1 += h1*r0 1949 vpmuludq $H3,$T0,$T4 # h3*r0 1950 vpmuludq $H4,$T0,$H2 # h4*r0 1951 vmovdqu 16*0($inp),%x#$T0 # load input 1952 vpaddq $T4,$D3,$D3 # d3 += h3*r0 1953 vpaddq $H2,$D4,$D4 # d4 += h4*r0 1954 vinserti128 \$1,16*2($inp),$T0,$T0 1955 1956 vpmuludq $H3,$T1,$T4 # h3*s2 1957 vpmuludq $H4,$T1,$H2 # h4*s2 1958 vmovdqu 16*1($inp),%x#$T1 1959 vpaddq $T4,$D0,$D0 # d0 += h3*s2 1960 vpaddq $H2,$D1,$D1 # d1 += h4*s2 1961 vmovdqa `32*5-0x90`(%rax),$H2 # r3 1962 vpmuludq $H1,$T2,$T4 # h1*r2 1963 vpmuludq $H0,$T2,$T2 # h0*r2 1964 vpaddq $T4,$D3,$D3 # d3 += h1*r2 1965 vpaddq $T2,$D2,$D2 # d2 += h0*r2 1966 vinserti128 \$1,16*3($inp),$T1,$T1 1967 lea 16*4($inp),$inp 1968 1969 vpmuludq $H1,$H2,$T4 # h1*r3 1970 vpmuludq $H0,$H2,$H2 # h0*r3 1971 vpsrldq \$6,$T0,$T2 # splat input 1972 vpaddq $T4,$D4,$D4 # d4 += h1*r3 1973 vpaddq $H2,$D3,$D3 # d3 += h0*r3 1974 vpmuludq $H3,$T3,$T4 # h3*s3 1975 vpmuludq $H4,$T3,$H2 # h4*s3 1976 vpsrldq \$6,$T1,$T3 1977 vpaddq $T4,$D1,$D1 # d1 += h3*s3 1978 vpaddq $H2,$D2,$D2 # d2 += h4*s3 1979 vpunpckhqdq $T1,$T0,$T4 # 4 1980 1981 vpmuludq $H3,$S4,$H3 # h3*s4 1982 vpmuludq $H4,$S4,$H4 # h4*s4 1983 vpunpcklqdq $T1,$T0,$T0 # 0:1 1984 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 1985 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 1986 vpunpcklqdq $T3,$T2,$T3 # 2:3 1987 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 1988 vpmuludq $H1,$S4,$H0 # h1*s4 1989 vmovdqa 64(%rcx),$MASK # .Lmask26 1990 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1991 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1992 1993 ################################################################ 1994 # lazy reduction (interleaved with tail of input splat) 1995 1996 vpsrlq \$26,$H3,$D3 1997 vpand $MASK,$H3,$H3 1998 vpaddq $D3,$H4,$H4 # h3 -> h4 1999 2000 vpsrlq \$26,$H0,$D0 2001 vpand $MASK,$H0,$H0 2002 vpaddq $D0,$D1,$H1 # h0 -> h1 2003 2004 vpsrlq \$26,$H4,$D4 2005 vpand $MASK,$H4,$H4 2006 2007 vpsrlq \$4,$T3,$T2 2008 2009 vpsrlq \$26,$H1,$D1 2010 vpand $MASK,$H1,$H1 2011 vpaddq $D1,$H2,$H2 # h1 -> h2 2012 2013 vpaddq $D4,$H0,$H0 2014 vpsllq \$2,$D4,$D4 2015 vpaddq $D4,$H0,$H0 # h4 -> h0 2016 2017 vpand $MASK,$T2,$T2 # 2 2018 vpsrlq \$26,$T0,$T1 2019 2020 vpsrlq \$26,$H2,$D2 2021 vpand $MASK,$H2,$H2 2022 vpaddq $D2,$H3,$H3 # h2 -> h3 2023 2024 vpaddq $T2,$H2,$H2 # modulo-scheduled 2025 vpsrlq \$30,$T3,$T3 2026 2027 vpsrlq \$26,$H0,$D0 2028 vpand $MASK,$H0,$H0 2029 vpaddq $D0,$H1,$H1 # h0 -> h1 2030 2031 vpsrlq \$40,$T4,$T4 # 4 2032 2033 vpsrlq \$26,$H3,$D3 2034 vpand $MASK,$H3,$H3 2035 vpaddq $D3,$H4,$H4 # h3 -> h4 2036 2037 vpand $MASK,$T0,$T0 # 0 2038 vpand $MASK,$T1,$T1 # 1 2039 vpand $MASK,$T3,$T3 # 3 2040 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 2041 2042 sub \$64,$len 2043 jnz .Loop_avx2$suffix 2044 2045 .byte 0x66,0x90 2046 .Ltail_avx2$suffix: 2047 ################################################################ 2048 # while above multiplications were by r^4 in all lanes, in last 2049 # iteration we multiply least significant lane by r^4 and most 2050 # significant one by r, so copy of above except that references 2051 # to the precomputed table are displaced by 4... 
2052 2053 #vpaddq $H2,$T2,$H2 # accumulate input 2054 vpaddq $H0,$T0,$H0 2055 vmovdqu `32*0+4`(%rsp),$T0 # r0^4 2056 vpaddq $H1,$T1,$H1 2057 vmovdqu `32*1+4`(%rsp),$T1 # r1^4 2058 vpaddq $H3,$T3,$H3 2059 vmovdqu `32*3+4`(%rsp),$T2 # r2^4 2060 vpaddq $H4,$T4,$H4 2061 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 2062 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 2063 2064 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 2065 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 2066 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 2067 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 2068 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2069 2070 vpmuludq $H0,$T1,$T4 # h0*r1 2071 vpmuludq $H1,$T1,$H2 # h1*r1 2072 vpaddq $T4,$D1,$D1 # d1 += h0*r1 2073 vpaddq $H2,$D2,$D2 # d2 += h1*r1 2074 vpmuludq $H3,$T1,$T4 # h3*r1 2075 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 2076 vpaddq $T4,$D4,$D4 # d4 += h3*r1 2077 vpaddq $H2,$D0,$D0 # d0 += h4*s1 2078 2079 vpmuludq $H0,$T0,$T4 # h0*r0 2080 vpmuludq $H1,$T0,$H2 # h1*r0 2081 vpaddq $T4,$D0,$D0 # d0 += h0*r0 2082 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 2083 vpaddq $H2,$D1,$D1 # d1 += h1*r0 2084 vpmuludq $H3,$T0,$T4 # h3*r0 2085 vpmuludq $H4,$T0,$H2 # h4*r0 2086 vpaddq $T4,$D3,$D3 # d3 += h3*r0 2087 vpaddq $H2,$D4,$D4 # d4 += h4*r0 2088 2089 vpmuludq $H3,$T1,$T4 # h3*s2 2090 vpmuludq $H4,$T1,$H2 # h4*s2 2091 vpaddq $T4,$D0,$D0 # d0 += h3*s2 2092 vpaddq $H2,$D1,$D1 # d1 += h4*s2 2093 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 2094 vpmuludq $H1,$T2,$T4 # h1*r2 2095 vpmuludq $H0,$T2,$T2 # h0*r2 2096 vpaddq $T4,$D3,$D3 # d3 += h1*r2 2097 vpaddq $T2,$D2,$D2 # d2 += h0*r2 2098 2099 vpmuludq $H1,$H2,$T4 # h1*r3 2100 vpmuludq $H0,$H2,$H2 # h0*r3 2101 vpaddq $T4,$D4,$D4 # d4 += h1*r3 2102 vpaddq $H2,$D3,$D3 # d3 += h0*r3 2103 vpmuludq $H3,$T3,$T4 # h3*s3 2104 vpmuludq $H4,$T3,$H2 # h4*s3 2105 vpaddq $T4,$D1,$D1 # d1 += h3*s3 2106 vpaddq $H2,$D2,$D2 # d2 += h4*s3 2107 2108 vpmuludq $H3,$S4,$H3 # h3*s4 2109 vpmuludq $H4,$S4,$H4 # h4*s4 2110 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 2111 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 2112 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 2113 vpmuludq $H1,$S4,$H0 # h1*s4 2114 vmovdqa 64(%rcx),$MASK # .Lmask26 2115 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 2116 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 2117 2118 ################################################################ 2119 # horizontal addition 2120 2121 vpsrldq \$8,$D1,$T1 2122 vpsrldq \$8,$H2,$T2 2123 vpsrldq \$8,$H3,$T3 2124 vpsrldq \$8,$H4,$T4 2125 vpsrldq \$8,$H0,$T0 2126 vpaddq $T1,$D1,$D1 2127 vpaddq $T2,$H2,$H2 2128 vpaddq $T3,$H3,$H3 2129 vpaddq $T4,$H4,$H4 2130 vpaddq $T0,$H0,$H0 2131 2132 vpermq \$0x2,$H3,$T3 2133 vpermq \$0x2,$H4,$T4 2134 vpermq \$0x2,$H0,$T0 2135 vpermq \$0x2,$D1,$T1 2136 vpermq \$0x2,$H2,$T2 2137 vpaddq $T3,$H3,$H3 2138 vpaddq $T4,$H4,$H4 2139 vpaddq $T0,$H0,$H0 2140 vpaddq $T1,$D1,$D1 2141 vpaddq $T2,$H2,$H2 2142 2143 ################################################################ 2144 # lazy reduction 2145 2146 vpsrlq \$26,$H3,$D3 2147 vpand $MASK,$H3,$H3 2148 vpaddq $D3,$H4,$H4 # h3 -> h4 2149 2150 vpsrlq \$26,$H0,$D0 2151 vpand $MASK,$H0,$H0 2152 vpaddq $D0,$D1,$H1 # h0 -> h1 2153 2154 vpsrlq \$26,$H4,$D4 2155 vpand $MASK,$H4,$H4 2156 2157 vpsrlq \$26,$H1,$D1 2158 vpand $MASK,$H1,$H1 2159 vpaddq $D1,$H2,$H2 # h1 -> h2 2160 2161 vpaddq $D4,$H0,$H0 2162 vpsllq \$2,$D4,$D4 2163 vpaddq $D4,$H0,$H0 # h4 -> h0 2164 2165 vpsrlq \$26,$H2,$D2 2166 vpand $MASK,$H2,$H2 2167 vpaddq $D2,$H3,$H3 # h2 -> h3 2168 2169 vpsrlq \$26,$H0,$D0 2170 vpand $MASK,$H0,$H0 2171 vpaddq $D0,$H1,$H1 # h0 -> h1 2172 2173 vpsrlq \$26,$H3,$D3 2174 vpand $MASK,$H3,$H3 2175 vpaddq 
$D3,$H4,$H4 # h3 -> h4 2176 2177 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2178 vmovd %x#$H1,`4*1-48-64`($ctx) 2179 vmovd %x#$H2,`4*2-48-64`($ctx) 2180 vmovd %x#$H3,`4*3-48-64`($ctx) 2181 vmovd %x#$H4,`4*4-48-64`($ctx) 2182 ___ 2183 $code.=<<___ if ($win64); 2184 vmovdqa -0xb0(%r10),%xmm6 2185 vmovdqa -0xa0(%r10),%xmm7 2186 vmovdqa -0x90(%r10),%xmm8 2187 vmovdqa -0x80(%r10),%xmm9 2188 vmovdqa -0x70(%r10),%xmm10 2189 vmovdqa -0x60(%r10),%xmm11 2190 vmovdqa -0x50(%r10),%xmm12 2191 vmovdqa -0x40(%r10),%xmm13 2192 vmovdqa -0x30(%r10),%xmm14 2193 vmovdqa -0x20(%r10),%xmm15 2194 lea -8(%r10),%rsp 2195 .Ldo_avx2_epilogue$suffix: 2196 ___ 2197 $code.=<<___ if (!$win64); 2198 lea -8(%r10),%rsp 2199 .cfi_def_cfa_register %rsp 2200 ___ 2201 $code.=<<___; 2202 vzeroupper 2203 RET 2204 .cfi_endproc 2205 ___ 2206 if($avx > 2 && $avx512) { 2207 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); 2208 my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); 2209 my $PADBIT="%zmm30"; 2210 2211 map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain 2212 map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); 2213 map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); 2214 map(s/%y/%z/,($MASK)); 2215 2216 $code.=<<___; 2217 .cfi_startproc 2218 .Lblocks_avx512: 2219 mov \$15,%eax 2220 kmovw %eax,%k2 2221 ___ 2222 $code.=<<___ if (!$win64); 2223 lea 8(%rsp),%r10 2224 .cfi_def_cfa_register %r10 2225 sub \$0x128,%rsp 2226 ___ 2227 $code.=<<___ if ($win64); 2228 lea 8(%rsp),%r10 2229 sub \$0x1c8,%rsp 2230 vmovdqa %xmm6,-0xb0(%r10) 2231 vmovdqa %xmm7,-0xa0(%r10) 2232 vmovdqa %xmm8,-0x90(%r10) 2233 vmovdqa %xmm9,-0x80(%r10) 2234 vmovdqa %xmm10,-0x70(%r10) 2235 vmovdqa %xmm11,-0x60(%r10) 2236 vmovdqa %xmm12,-0x50(%r10) 2237 vmovdqa %xmm13,-0x40(%r10) 2238 vmovdqa %xmm14,-0x30(%r10) 2239 vmovdqa %xmm15,-0x20(%r10) 2240 .Ldo_avx512_body: 2241 ___ 2242 $code.=<<___; 2243 lea .Lconst(%rip),%rcx 2244 lea 48+64($ctx),$ctx # size optimization 2245 vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 2246 2247 # expand pre-calculated table 2248 vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} 2249 and \$-512,%rsp 2250 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} 2251 mov \$0x20,%rax 2252 vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} 2253 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} 2254 vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} 2255 vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} 2256 vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} 2257 vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} 2258 vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} 2259 vpermd $D0,$T2,$R0 # 00003412 -> 14243444 2260 vpbroadcastq 64(%rcx),$MASK # .Lmask26 2261 vpermd $D1,$T2,$R1 2262 vpermd $T0,$T2,$S1 2263 vpermd $D2,$T2,$R2 2264 vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 2265 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 2266 vpermd $T1,$T2,$S2 2267 vmovdqu64 $R1,0x00(%rsp,%rax){%k2} 2268 vpsrlq \$32,$R1,$T1 2269 vpermd $D3,$T2,$R3 2270 vmovdqa64 $S1,0x40(%rsp){%k2} 2271 vpermd $T3,$T2,$S3 2272 vpermd $D4,$T2,$R4 2273 vmovdqu64 $R2,0x40(%rsp,%rax){%k2} 2274 vpermd $T4,$T2,$S4 2275 vmovdqa64 $S2,0x80(%rsp){%k2} 2276 vmovdqu64 $R3,0x80(%rsp,%rax){%k2} 2277 vmovdqa64 $S3,0xc0(%rsp){%k2} 2278 vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} 2279 vmovdqa64 $S4,0x100(%rsp){%k2} 2280 2281 ################################################################ 2282 # calculate 5th through 8th powers of the key 2283 # 2284 # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 2285 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 2286 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 2287 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 2288 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 2289 2290 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 2291 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 2292 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 2293 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 2294 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 2295 vpsrlq \$32,$R2,$T2 2296 2297 vpmuludq $T1,$S4,$M0 2298 vpmuludq $T1,$R0,$M1 2299 vpmuludq $T1,$R1,$M2 2300 vpmuludq $T1,$R2,$M3 2301 vpmuludq $T1,$R3,$M4 2302 vpsrlq \$32,$R3,$T3 2303 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 2304 vpaddq $M1,$D1,$D1 # d1 += r1'*r0 2305 vpaddq $M2,$D2,$D2 # d2 += r1'*r1 2306 vpaddq $M3,$D3,$D3 # d3 += r1'*r2 2307 vpaddq $M4,$D4,$D4 # d4 += r1'*r3 2308 2309 vpmuludq $T2,$S3,$M0 2310 vpmuludq $T2,$S4,$M1 2311 vpmuludq $T2,$R1,$M3 2312 vpmuludq $T2,$R2,$M4 2313 vpmuludq $T2,$R0,$M2 2314 vpsrlq \$32,$R4,$T4 2315 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 2316 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 2317 vpaddq $M3,$D3,$D3 # d3 += r2'*r1 2318 vpaddq $M4,$D4,$D4 # d4 += r2'*r2 2319 vpaddq $M2,$D2,$D2 # d2 += r2'*r0 2320 2321 vpmuludq $T3,$S2,$M0 2322 vpmuludq $T3,$R0,$M3 2323 vpmuludq $T3,$R1,$M4 2324 vpmuludq $T3,$S3,$M1 2325 vpmuludq $T3,$S4,$M2 2326 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 2327 vpaddq $M3,$D3,$D3 # d3 += r3'*r0 2328 vpaddq $M4,$D4,$D4 # d4 += r3'*r1 2329 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 2330 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 2331 2332 vpmuludq $T4,$S4,$M3 2333 vpmuludq $T4,$R0,$M4 2334 vpmuludq $T4,$S1,$M0 2335 vpmuludq $T4,$S2,$M1 2336 vpmuludq $T4,$S3,$M2 2337 vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 2338 vpaddq $M4,$D4,$D4 # d4 += r2'*r0 2339 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 2340 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 2341 vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 2342 2343 ################################################################ 2344 # load input 2345 vmovdqu64 16*0($inp),%z#$T3 2346 vmovdqu64 16*4($inp),%z#$T4 2347 lea 16*8($inp),$inp 2348 2349 ################################################################ 2350 # lazy reduction 2351 2352 vpsrlq \$26,$D3,$M3 2353 vpandq $MASK,$D3,$D3 2354 vpaddq $M3,$D4,$D4 # d3 -> d4 2355 2356 vpsrlq \$26,$D0,$M0 2357 vpandq $MASK,$D0,$D0 2358 vpaddq $M0,$D1,$D1 # d0 -> d1 2359 2360 vpsrlq \$26,$D4,$M4 2361 vpandq $MASK,$D4,$D4 2362 2363 vpsrlq \$26,$D1,$M1 2364 vpandq $MASK,$D1,$D1 2365 vpaddq $M1,$D2,$D2 # d1 -> d2 2366 2367 vpaddq $M4,$D0,$D0 2368 vpsllq \$2,$M4,$M4 2369 vpaddq $M4,$D0,$D0 # d4 -> d0 2370 2371 vpsrlq \$26,$D2,$M2 2372 vpandq $MASK,$D2,$D2 2373 vpaddq $M2,$D3,$D3 # 
d2 -> d3 2374 2375 vpsrlq \$26,$D0,$M0 2376 vpandq $MASK,$D0,$D0 2377 vpaddq $M0,$D1,$D1 # d0 -> d1 2378 2379 vpsrlq \$26,$D3,$M3 2380 vpandq $MASK,$D3,$D3 2381 vpaddq $M3,$D4,$D4 # d3 -> d4 2382 2383 ################################################################ 2384 # at this point we have 14243444 in $R0-$S4 and 05060708 in 2385 # $D0-$D4, ... 2386 2387 vpunpcklqdq $T4,$T3,$T0 # transpose input 2388 vpunpckhqdq $T4,$T3,$T4 2389 2390 # ... since input 64-bit lanes are ordered as 73625140, we could 2391 # "vperm" it to 76543210 (here and in each loop iteration), *or* 2392 # we could just flow along, hence the goal for $R0-$S4 is 2393 # 1858286838784888 ... 2394 2395 vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: 2396 mov \$0x7777,%eax 2397 kmovw %eax,%k1 2398 2399 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- 2400 vpermd $R1,$M0,$R1 2401 vpermd $R2,$M0,$R2 2402 vpermd $R3,$M0,$R3 2403 vpermd $R4,$M0,$R4 2404 2405 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 2406 vpermd $D1,$M0,${R1}{%k1} 2407 vpermd $D2,$M0,${R2}{%k1} 2408 vpermd $D3,$M0,${R3}{%k1} 2409 vpermd $D4,$M0,${R4}{%k1} 2410 2411 vpslld \$2,$R1,$S1 # *5 2412 vpslld \$2,$R2,$S2 2413 vpslld \$2,$R3,$S3 2414 vpslld \$2,$R4,$S4 2415 vpaddd $R1,$S1,$S1 2416 vpaddd $R2,$S2,$S2 2417 vpaddd $R3,$S3,$S3 2418 vpaddd $R4,$S4,$S4 2419 2420 vpbroadcastq 32(%rcx),$PADBIT # .L129 2421 2422 vpsrlq \$52,$T0,$T2 # splat input 2423 vpsllq \$12,$T4,$T3 2424 vporq $T3,$T2,$T2 2425 vpsrlq \$26,$T0,$T1 2426 vpsrlq \$14,$T4,$T3 2427 vpsrlq \$40,$T4,$T4 # 4 2428 vpandq $MASK,$T2,$T2 # 2 2429 vpandq $MASK,$T0,$T0 # 0 2430 #vpandq $MASK,$T1,$T1 # 1 2431 #vpandq $MASK,$T3,$T3 # 3 2432 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2433 2434 vpaddq $H2,$T2,$H2 # accumulate input 2435 sub \$192,$len 2436 jbe .Ltail_avx512 2437 jmp .Loop_avx512 2438 2439 .align 32 2440 .Loop_avx512: 2441 ################################################################ 2442 # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 2443 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 2444 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 2445 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 2446 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 2447 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 2448 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 2449 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 2450 # \________/\___________/ 2451 ################################################################ 2452 #vpaddq $H2,$T2,$H2 # accumulate input 2453 2454 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 2455 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 2456 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 2457 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 2458 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 2459 # 2460 # however, as h2 is "chronologically" first one available pull 2461 # corresponding operations up, so it's 2462 # 2463 # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 2464 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 2465 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 2466 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 2467 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 2468 2469 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2470 vpaddq $H0,$T0,$H0 2471 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2472 vpandq $MASK,$T1,$T1 # 1 2473 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2474 vpandq $MASK,$T3,$T3 # 3 2475 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2476 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2477 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2478 vpaddq $H1,$T1,$H1 # accumulate input 2479 vpaddq $H3,$T3,$H3 2480 vpaddq $H4,$T4,$H4 2481 2482 vmovdqu64 
16*0($inp),$T3 # load input 2483 vmovdqu64 16*4($inp),$T4 2484 lea 16*8($inp),$inp 2485 vpmuludq $H0,$R3,$M3 2486 vpmuludq $H0,$R4,$M4 2487 vpmuludq $H0,$R0,$M0 2488 vpmuludq $H0,$R1,$M1 2489 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2490 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2491 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2492 vpaddq $M1,$D1,$D1 # d1 += h0*r1 2493 2494 vpmuludq $H1,$R2,$M3 2495 vpmuludq $H1,$R3,$M4 2496 vpmuludq $H1,$S4,$M0 2497 vpmuludq $H0,$R2,$M2 2498 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2499 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2500 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2501 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2502 2503 vpunpcklqdq $T4,$T3,$T0 # transpose input 2504 vpunpckhqdq $T4,$T3,$T4 2505 2506 vpmuludq $H3,$R0,$M3 2507 vpmuludq $H3,$R1,$M4 2508 vpmuludq $H1,$R0,$M1 2509 vpmuludq $H1,$R1,$M2 2510 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2511 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2512 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2513 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2514 2515 vpmuludq $H4,$S4,$M3 2516 vpmuludq $H4,$R0,$M4 2517 vpmuludq $H3,$S2,$M0 2518 vpmuludq $H3,$S3,$M1 2519 vpaddq $M3,$D3,$D3 # d3 += h4*s4 2520 vpmuludq $H3,$S4,$M2 2521 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2522 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2523 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2524 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2525 2526 vpmuludq $H4,$S1,$M0 2527 vpmuludq $H4,$S2,$M1 2528 vpmuludq $H4,$S3,$M2 2529 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2530 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2531 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2532 2533 ################################################################ 2534 # lazy reduction (interleaved with input splat) 2535 2536 vpsrlq \$52,$T0,$T2 # splat input 2537 vpsllq \$12,$T4,$T3 2538 2539 vpsrlq \$26,$D3,$H3 2540 vpandq $MASK,$D3,$D3 2541 vpaddq $H3,$D4,$H4 # h3 -> h4 2542 2543 vporq $T3,$T2,$T2 2544 2545 vpsrlq \$26,$H0,$D0 2546 vpandq $MASK,$H0,$H0 2547 vpaddq $D0,$H1,$H1 # h0 -> h1 2548 2549 vpandq $MASK,$T2,$T2 # 2 2550 2551 vpsrlq \$26,$H4,$D4 2552 vpandq $MASK,$H4,$H4 2553 2554 vpsrlq \$26,$H1,$D1 2555 vpandq $MASK,$H1,$H1 2556 vpaddq $D1,$H2,$H2 # h1 -> h2 2557 2558 vpaddq $D4,$H0,$H0 2559 vpsllq \$2,$D4,$D4 2560 vpaddq $D4,$H0,$H0 # h4 -> h0 2561 2562 vpaddq $T2,$H2,$H2 # modulo-scheduled 2563 vpsrlq \$26,$T0,$T1 2564 2565 vpsrlq \$26,$H2,$D2 2566 vpandq $MASK,$H2,$H2 2567 vpaddq $D2,$D3,$H3 # h2 -> h3 2568 2569 vpsrlq \$14,$T4,$T3 2570 2571 vpsrlq \$26,$H0,$D0 2572 vpandq $MASK,$H0,$H0 2573 vpaddq $D0,$H1,$H1 # h0 -> h1 2574 2575 vpsrlq \$40,$T4,$T4 # 4 2576 2577 vpsrlq \$26,$H3,$D3 2578 vpandq $MASK,$H3,$H3 2579 vpaddq $D3,$H4,$H4 # h3 -> h4 2580 2581 vpandq $MASK,$T0,$T0 # 0 2582 #vpandq $MASK,$T1,$T1 # 1 2583 #vpandq $MASK,$T3,$T3 # 3 2584 #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2585 2586 sub \$128,$len 2587 ja .Loop_avx512 2588 2589 .Ltail_avx512: 2590 ################################################################ 2591 # while above multiplications were by r^8 in all lanes, in last 2592 # iteration we multiply least significant lane by r^8 and most 2593 # significant one by r, that's why table gets shifted... 
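#
# Sketch of the bookkeeping behind that: after K iterations lane j holds
#
#     h[j] = sum_k inp[8*k+j]*r^(8*(K-1-k))
#
# and multiplying lane j by r^(8-j) turns each term into
# inp[8*k+j]*r^(8*K-(8*k+j)), i.e. exactly the weight the serial
# evaluation assigns to that block, so the horizontal sum of the lanes
# reproduces sum_i inp[i]*r^(N-i) for N=8*K. The 32-bit right shifts
# below simply re-point the table registers at those descending powers.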
2594 2595 vpsrlq \$32,$R0,$R0 # 0105020603070408 2596 vpsrlq \$32,$R1,$R1 2597 vpsrlq \$32,$R2,$R2 2598 vpsrlq \$32,$S3,$S3 2599 vpsrlq \$32,$S4,$S4 2600 vpsrlq \$32,$R3,$R3 2601 vpsrlq \$32,$R4,$R4 2602 vpsrlq \$32,$S1,$S1 2603 vpsrlq \$32,$S2,$S2 2604 2605 ################################################################ 2606 # load either next or last 64 byte of input 2607 lea ($inp,$len),$inp 2608 2609 #vpaddq $H2,$T2,$H2 # accumulate input 2610 vpaddq $H0,$T0,$H0 2611 2612 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2613 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2614 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2615 vpandq $MASK,$T1,$T1 # 1 2616 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2617 vpandq $MASK,$T3,$T3 # 3 2618 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2619 vporq $PADBIT,$T4,$T4 # padbit, yes, always 2620 vpaddq $H1,$T1,$H1 # accumulate input 2621 vpaddq $H3,$T3,$H3 2622 vpaddq $H4,$T4,$H4 2623 2624 vmovdqu 16*0($inp),%x#$T0 2625 vpmuludq $H0,$R3,$M3 2626 vpmuludq $H0,$R4,$M4 2627 vpmuludq $H0,$R0,$M0 2628 vpmuludq $H0,$R1,$M1 2629 vpaddq $M3,$D3,$D3 # d3 += h0*r3 2630 vpaddq $M4,$D4,$D4 # d4 += h0*r4 2631 vpaddq $M0,$D0,$D0 # d0 += h0*r0 2632 vpaddq $M1,$D1,$D1 # d1 += h0*r1 2633 2634 vmovdqu 16*1($inp),%x#$T1 2635 vpmuludq $H1,$R2,$M3 2636 vpmuludq $H1,$R3,$M4 2637 vpmuludq $H1,$S4,$M0 2638 vpmuludq $H0,$R2,$M2 2639 vpaddq $M3,$D3,$D3 # d3 += h1*r2 2640 vpaddq $M4,$D4,$D4 # d4 += h1*r3 2641 vpaddq $M0,$D0,$D0 # d0 += h1*s4 2642 vpaddq $M2,$D2,$D2 # d2 += h0*r2 2643 2644 vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 2645 vpmuludq $H3,$R0,$M3 2646 vpmuludq $H3,$R1,$M4 2647 vpmuludq $H1,$R0,$M1 2648 vpmuludq $H1,$R1,$M2 2649 vpaddq $M3,$D3,$D3 # d3 += h3*r0 2650 vpaddq $M4,$D4,$D4 # d4 += h3*r1 2651 vpaddq $M1,$D1,$D1 # d1 += h1*r0 2652 vpaddq $M2,$D2,$D2 # d2 += h1*r1 2653 2654 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 2655 vpmuludq $H4,$S4,$M3 2656 vpmuludq $H4,$R0,$M4 2657 vpmuludq $H3,$S2,$M0 2658 vpmuludq $H3,$S3,$M1 2659 vpmuludq $H3,$S4,$M2 2660 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 2661 vpaddq $M4,$D4,$D4 # d4 += h4*r0 2662 vpaddq $M0,$D0,$D0 # d0 += h3*s2 2663 vpaddq $M1,$D1,$D1 # d1 += h3*s3 2664 vpaddq $M2,$D2,$D2 # d2 += h3*s4 2665 2666 vpmuludq $H4,$S1,$M0 2667 vpmuludq $H4,$S2,$M1 2668 vpmuludq $H4,$S3,$M2 2669 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2670 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2671 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2672 2673 ################################################################ 2674 # horizontal addition 2675 2676 mov \$1,%eax 2677 vpermq \$0xb1,$H3,$D3 2678 vpermq \$0xb1,$D4,$H4 2679 vpermq \$0xb1,$H0,$D0 2680 vpermq \$0xb1,$H1,$D1 2681 vpermq \$0xb1,$H2,$D2 2682 vpaddq $D3,$H3,$H3 2683 vpaddq $D4,$H4,$H4 2684 vpaddq $D0,$H0,$H0 2685 vpaddq $D1,$H1,$H1 2686 vpaddq $D2,$H2,$H2 2687 2688 kmovw %eax,%k3 2689 vpermq \$0x2,$H3,$D3 2690 vpermq \$0x2,$H4,$D4 2691 vpermq \$0x2,$H0,$D0 2692 vpermq \$0x2,$H1,$D1 2693 vpermq \$0x2,$H2,$D2 2694 vpaddq $D3,$H3,$H3 2695 vpaddq $D4,$H4,$H4 2696 vpaddq $D0,$H0,$H0 2697 vpaddq $D1,$H1,$H1 2698 vpaddq $D2,$H2,$H2 2699 2700 vextracti64x4 \$0x1,$H3,%y#$D3 2701 vextracti64x4 \$0x1,$H4,%y#$D4 2702 vextracti64x4 \$0x1,$H0,%y#$D0 2703 vextracti64x4 \$0x1,$H1,%y#$D1 2704 vextracti64x4 \$0x1,$H2,%y#$D2 2705 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case 2706 vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 2707 vpaddq $D0,$H0,${H0}{%k3}{z} 2708 vpaddq $D1,$H1,${H1}{%k3}{z} 2709 vpaddq $D2,$H2,${H2}{%k3}{z} 2710 ___ 2711 map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); 2712 map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); 2713 
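# The switch back to %ymm names above works because the register "names"
# are ordinary Perl strings interpolated into $code: map() aliases $_ to
# each listed scalar and the in-place s/%z/%y/ rewrites it, so every
# instruction emitted from this point on targets the narrower registers.
# Minimal sketch of the idiom (hypothetical variables, not used by the
# generator):
#
#     my @regs = ("%zmm16", "%zmm17");
#     map(s/%z/%y/, @regs);    # @regs now holds ("%ymm16", "%ymm17")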
$code.=<<___; 2714 ################################################################ 2715 # lazy reduction (interleaved with input splat) 2716 2717 vpsrlq \$26,$H3,$D3 2718 vpand $MASK,$H3,$H3 2719 vpsrldq \$6,$T0,$T2 # splat input 2720 vpsrldq \$6,$T1,$T3 2721 vpunpckhqdq $T1,$T0,$T4 # 4 2722 vpaddq $D3,$H4,$H4 # h3 -> h4 2723 2724 vpsrlq \$26,$H0,$D0 2725 vpand $MASK,$H0,$H0 2726 vpunpcklqdq $T3,$T2,$T2 # 2:3 2727 vpunpcklqdq $T1,$T0,$T0 # 0:1 2728 vpaddq $D0,$H1,$H1 # h0 -> h1 2729 2730 vpsrlq \$26,$H4,$D4 2731 vpand $MASK,$H4,$H4 2732 2733 vpsrlq \$26,$H1,$D1 2734 vpand $MASK,$H1,$H1 2735 vpsrlq \$30,$T2,$T3 2736 vpsrlq \$4,$T2,$T2 2737 vpaddq $D1,$H2,$H2 # h1 -> h2 2738 2739 vpaddq $D4,$H0,$H0 2740 vpsllq \$2,$D4,$D4 2741 vpsrlq \$26,$T0,$T1 2742 vpsrlq \$40,$T4,$T4 # 4 2743 vpaddq $D4,$H0,$H0 # h4 -> h0 2744 2745 vpsrlq \$26,$H2,$D2 2746 vpand $MASK,$H2,$H2 2747 vpand $MASK,$T2,$T2 # 2 2748 vpand $MASK,$T0,$T0 # 0 2749 vpaddq $D2,$H3,$H3 # h2 -> h3 2750 2751 vpsrlq \$26,$H0,$D0 2752 vpand $MASK,$H0,$H0 2753 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 2754 vpand $MASK,$T1,$T1 # 1 2755 vpaddq $D0,$H1,$H1 # h0 -> h1 2756 2757 vpsrlq \$26,$H3,$D3 2758 vpand $MASK,$H3,$H3 2759 vpand $MASK,$T3,$T3 # 3 2760 vpor 32(%rcx),$T4,$T4 # padbit, yes, always 2761 vpaddq $D3,$H4,$H4 # h3 -> h4 2762 2763 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 2764 add \$64,$len 2765 jnz .Ltail_avx2$suffix 2766 2767 vpsubq $T2,$H2,$H2 # undo input accumulation 2768 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2769 vmovd %x#$H1,`4*1-48-64`($ctx) 2770 vmovd %x#$H2,`4*2-48-64`($ctx) 2771 vmovd %x#$H3,`4*3-48-64`($ctx) 2772 vmovd %x#$H4,`4*4-48-64`($ctx) 2773 vzeroall 2774 ___ 2775 $code.=<<___ if ($win64); 2776 movdqa -0xb0(%r10),%xmm6 2777 movdqa -0xa0(%r10),%xmm7 2778 movdqa -0x90(%r10),%xmm8 2779 movdqa -0x80(%r10),%xmm9 2780 movdqa -0x70(%r10),%xmm10 2781 movdqa -0x60(%r10),%xmm11 2782 movdqa -0x50(%r10),%xmm12 2783 movdqa -0x40(%r10),%xmm13 2784 movdqa -0x30(%r10),%xmm14 2785 movdqa -0x20(%r10),%xmm15 2786 lea -8(%r10),%rsp 2787 .Ldo_avx512_epilogue: 2788 ___ 2789 $code.=<<___ if (!$win64); 2790 lea -8(%r10),%rsp 2791 .cfi_def_cfa_register %rsp 2792 ___ 2793 $code.=<<___; 2794 RET 2795 .cfi_endproc 2796 ___ 2797 2798 } 2799 2800 } 2801 2802 &declare_function("poly1305_blocks_avx2", 32, 4); 2803 poly1305_blocks_avxN(0); 2804 &end_function("poly1305_blocks_avx2"); 2805 2806 ####################################################################### 2807 if ($avx>2) { 2808 # On entry we have input length divisible by 64. But since inner loop 2809 # processes 128 bytes per iteration, cases when length is not divisible 2810 # by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this 2811 # reason stack layout is kept identical to poly1305_blocks_avx2. If not 2812 # for this tail, we wouldn't have to even allocate stack frame... 2813 2814 if($kernel) { 2815 $code .= "#ifdef CONFIG_AS_AVX512\n"; 2816 } 2817 2818 &declare_function("poly1305_blocks_avx512", 32, 4); 2819 poly1305_blocks_avxN(1); 2820 &end_function("poly1305_blocks_avx512"); 2821 2822 if ($kernel) { 2823 $code .= "#endif\n"; 2824 } 2825 2826 if (!$kernel && $avx>3) { 2827 ######################################################################## 2828 # VPMADD52 version using 2^44 radix. 2829 # 2830 # One can argue that base 2^52 would be more natural. Well, even though 2831 # some operations would be more natural, one has to recognize couple of 2832 # things. 
Base 2^52 doesn't provide advantage over base 2^44 if you look 2833 # at amount of multiply-n-accumulate operations. Secondly, it makes it 2834 # impossible to pre-compute multiples of 5 [referred to as s[]/sN in 2835 # reference implementations], which means that more such operations 2836 # would have to be performed in inner loop, which in turn makes critical 2837 # path longer. In other words, even though base 2^44 reduction might 2838 # look less elegant, overall critical path is actually shorter... 2839 2840 ######################################################################## 2841 # Layout of opaque area is following. 2842 # 2843 # unsigned __int64 h[3]; # current hash value base 2^44 2844 # unsigned __int64 s[2]; # key value*20 base 2^44 2845 # unsigned __int64 r[3]; # key value base 2^44 2846 # struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; 2847 # # r^n positions reflect 2848 # # placement in register, not 2849 # # memory, R[3] is R[1]*20 2850 2851 $code.=<<___; 2852 .type poly1305_init_base2_44,\@function,3 2853 .align 32 2854 poly1305_init_base2_44: 2855 xor %eax,%eax 2856 mov %rax,0($ctx) # initialize hash value 2857 mov %rax,8($ctx) 2858 mov %rax,16($ctx) 2859 2860 .Linit_base2_44: 2861 lea poly1305_blocks_vpmadd52(%rip),%r10 2862 lea poly1305_emit_base2_44(%rip),%r11 2863 2864 mov \$0x0ffffffc0fffffff,%rax 2865 mov \$0x0ffffffc0ffffffc,%rcx 2866 and 0($inp),%rax 2867 mov \$0x00000fffffffffff,%r8 2868 and 8($inp),%rcx 2869 mov \$0x00000fffffffffff,%r9 2870 and %rax,%r8 2871 shrd \$44,%rcx,%rax 2872 mov %r8,40($ctx) # r0 2873 and %r9,%rax 2874 shr \$24,%rcx 2875 mov %rax,48($ctx) # r1 2876 lea (%rax,%rax,4),%rax # *5 2877 mov %rcx,56($ctx) # r2 2878 shl \$2,%rax # magic <<2 2879 lea (%rcx,%rcx,4),%rcx # *5 2880 shl \$2,%rcx # magic <<2 2881 mov %rax,24($ctx) # s1 2882 mov %rcx,32($ctx) # s2 2883 movq \$-1,64($ctx) # write impossible value 2884 ___ 2885 $code.=<<___ if ($flavour !~ /elf32/); 2886 mov %r10,0(%rdx) 2887 mov %r11,8(%rdx) 2888 ___ 2889 $code.=<<___ if ($flavour =~ /elf32/); 2890 mov %r10d,0(%rdx) 2891 mov %r11d,4(%rdx) 2892 ___ 2893 $code.=<<___; 2894 mov \$1,%eax 2895 RET 2896 .size poly1305_init_base2_44,.-poly1305_init_base2_44 2897 ___ 2898 { 2899 my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); 2900 my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); 2901 my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); 2902 2903 $code.=<<___; 2904 .type poly1305_blocks_vpmadd52,\@function,4 2905 .align 32 2906 poly1305_blocks_vpmadd52: 2907 shr \$4,$len 2908 jz .Lno_data_vpmadd52 # too short 2909 2910 shl \$40,$padbit 2911 mov 64($ctx),%r8 # peek on power of the key 2912 2913 # if powers of the key are not calculated yet, process up to 3 2914 # blocks with this single-block subroutine, otherwise ensure that 2915 # length is divisible by 2 blocks and pass the rest down to next 2916 # subroutine... 2917 2918 mov \$3,%rax 2919 mov \$1,%r10 2920 cmp \$4,$len # is input long 2921 cmovae %r10,%rax 2922 test %r8,%r8 # is power value impossible? 2923 cmovns %r10,%rax 2924 2925 and $len,%rax # is input of favourable length? 
2926 jz .Lblocks_vpmadd52_4x 2927 2928 sub %rax,$len 2929 mov \$7,%r10d 2930 mov \$1,%r11d 2931 kmovw %r10d,%k7 2932 lea .L2_44_inp_permd(%rip),%r10 2933 kmovw %r11d,%k1 2934 2935 vmovq $padbit,%x#$PAD 2936 vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd 2937 vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift 2938 vpermq \$0xcf,$PAD,$PAD 2939 vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask 2940 2941 vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value 2942 vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys 2943 vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} 2944 vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} 2945 2946 vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt 2947 vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft 2948 2949 jmp .Loop_vpmadd52 2950 2951 .align 32 2952 .Loop_vpmadd52: 2953 vmovdqu32 0($inp),%x#$T0 # load input as ----3210 2954 lea 16($inp),$inp 2955 2956 vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 2957 vpsrlvq $inp_shift,$T0,$T0 2958 vpandq $reduc_mask,$T0,$T0 2959 vporq $PAD,$T0,$T0 2960 2961 vpaddq $T0,$Dlo,$Dlo # accumulate input 2962 2963 vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value 2964 vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} 2965 vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} 2966 2967 vpxord $Dlo,$Dlo,$Dlo 2968 vpxord $Dhi,$Dhi,$Dhi 2969 2970 vpmadd52luq $r2r1r0,$H0,$Dlo 2971 vpmadd52huq $r2r1r0,$H0,$Dhi 2972 2973 vpmadd52luq $r1r0s2,$H1,$Dlo 2974 vpmadd52huq $r1r0s2,$H1,$Dhi 2975 2976 vpmadd52luq $r0s2s1,$H2,$Dlo 2977 vpmadd52huq $r0s2s1,$H2,$Dhi 2978 2979 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword 2980 vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword 2981 vpandq $reduc_mask,$Dlo,$Dlo 2982 2983 vpaddq $T0,$Dhi,$Dhi 2984 2985 vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword 2986 2987 vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) 2988 2989 vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word 2990 vpandq $reduc_mask,$Dlo,$Dlo 2991 2992 vpermq \$0b10010011,$T0,$T0 2993 2994 vpaddq $T0,$Dlo,$Dlo 2995 2996 vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} 2997 2998 vpaddq $T0,$Dlo,$Dlo 2999 vpsllq \$2,$T0,$T0 3000 3001 vpaddq $T0,$Dlo,$Dlo 3002 3003 dec %rax # len-=16 3004 jnz .Loop_vpmadd52 3005 3006 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value 3007 3008 test $len,$len 3009 jnz .Lblocks_vpmadd52_4x 3010 3011 .Lno_data_vpmadd52: 3012 RET 3013 .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 3014 ___ 3015 } 3016 { 3017 ######################################################################## 3018 # As implied by its name 4x subroutine processes 4 blocks in parallel 3019 # (but handles even 4*n+2 blocks lengths). It takes up to 4th key power 3020 # and is handled in 256-bit %ymm registers. 3021 3022 my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 3023 my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 3024 my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 3025 3026 $code.=<<___; 3027 .type poly1305_blocks_vpmadd52_4x,\@function,4 3028 .align 32 3029 poly1305_blocks_vpmadd52_4x: 3030 shr \$4,$len 3031 jz .Lno_data_vpmadd52_4x # too short 3032 3033 shl \$40,$padbit 3034 mov 64($ctx),%r8 # peek on power of the key 3035 3036 .Lblocks_vpmadd52_4x: 3037 vpbroadcastq $padbit,$PAD 3038 3039 vmovdqa64 .Lx_mask44(%rip),$mask44 3040 mov \$5,%eax 3041 vmovdqa64 .Lx_mask42(%rip),$mask42 3042 kmovw %eax,%k1 # used in 2x path 3043 3044 test %r8,%r8 # is power value impossible? 
3045 js .Linit_vpmadd52 # if it is, then init R[4] 3046 3047 vmovq 0($ctx),%x#$H0 # load current hash value 3048 vmovq 8($ctx),%x#$H1 3049 vmovq 16($ctx),%x#$H2 3050 3051 test \$3,$len # is length 4*n+2? 3052 jnz .Lblocks_vpmadd52_2x_do 3053 3054 .Lblocks_vpmadd52_4x_do: 3055 vpbroadcastq 64($ctx),$R0 # load 4th power of the key 3056 vpbroadcastq 96($ctx),$R1 3057 vpbroadcastq 128($ctx),$R2 3058 vpbroadcastq 160($ctx),$S1 3059 3060 .Lblocks_vpmadd52_4x_key_loaded: 3061 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3062 vpaddq $R2,$S2,$S2 3063 vpsllq \$2,$S2,$S2 3064 3065 test \$7,$len # is len 8*n? 3066 jz .Lblocks_vpmadd52_8x 3067 3068 vmovdqu64 16*0($inp),$T2 # load data 3069 vmovdqu64 16*2($inp),$T3 3070 lea 16*4($inp),$inp 3071 3072 vpunpcklqdq $T3,$T2,$T1 # transpose data 3073 vpunpckhqdq $T3,$T2,$T3 3074 3075 # at this point 64-bit lanes are ordered as 3-1-2-0 3076 3077 vpsrlq \$24,$T3,$T2 # splat the data 3078 vporq $PAD,$T2,$T2 3079 vpaddq $T2,$H2,$H2 # accumulate input 3080 vpandq $mask44,$T1,$T0 3081 vpsrlq \$44,$T1,$T1 3082 vpsllq \$20,$T3,$T3 3083 vporq $T3,$T1,$T1 3084 vpandq $mask44,$T1,$T1 3085 3086 sub \$4,$len 3087 jz .Ltail_vpmadd52_4x 3088 jmp .Loop_vpmadd52_4x 3089 ud2 3090 3091 .align 32 3092 .Linit_vpmadd52: 3093 vmovq 24($ctx),%x#$S1 # load key 3094 vmovq 56($ctx),%x#$H2 3095 vmovq 32($ctx),%x#$S2 3096 vmovq 40($ctx),%x#$R0 3097 vmovq 48($ctx),%x#$R1 3098 3099 vmovdqa $R0,$H0 3100 vmovdqa $R1,$H1 3101 vmovdqa $H2,$R2 3102 3103 mov \$2,%eax 3104 3105 .Lmul_init_vpmadd52: 3106 vpxorq $D0lo,$D0lo,$D0lo 3107 vpmadd52luq $H2,$S1,$D0lo 3108 vpxorq $D0hi,$D0hi,$D0hi 3109 vpmadd52huq $H2,$S1,$D0hi 3110 vpxorq $D1lo,$D1lo,$D1lo 3111 vpmadd52luq $H2,$S2,$D1lo 3112 vpxorq $D1hi,$D1hi,$D1hi 3113 vpmadd52huq $H2,$S2,$D1hi 3114 vpxorq $D2lo,$D2lo,$D2lo 3115 vpmadd52luq $H2,$R0,$D2lo 3116 vpxorq $D2hi,$D2hi,$D2hi 3117 vpmadd52huq $H2,$R0,$D2hi 3118 3119 vpmadd52luq $H0,$R0,$D0lo 3120 vpmadd52huq $H0,$R0,$D0hi 3121 vpmadd52luq $H0,$R1,$D1lo 3122 vpmadd52huq $H0,$R1,$D1hi 3123 vpmadd52luq $H0,$R2,$D2lo 3124 vpmadd52huq $H0,$R2,$D2hi 3125 3126 vpmadd52luq $H1,$S2,$D0lo 3127 vpmadd52huq $H1,$S2,$D0hi 3128 vpmadd52luq $H1,$R0,$D1lo 3129 vpmadd52huq $H1,$R0,$D1hi 3130 vpmadd52luq $H1,$R1,$D2lo 3131 vpmadd52huq $H1,$R1,$D2hi 3132 3133 ################################################################ 3134 # partial reduction 3135 vpsrlq \$44,$D0lo,$tmp 3136 vpsllq \$8,$D0hi,$D0hi 3137 vpandq $mask44,$D0lo,$H0 3138 vpaddq $tmp,$D0hi,$D0hi 3139 3140 vpaddq $D0hi,$D1lo,$D1lo 3141 3142 vpsrlq \$44,$D1lo,$tmp 3143 vpsllq \$8,$D1hi,$D1hi 3144 vpandq $mask44,$D1lo,$H1 3145 vpaddq $tmp,$D1hi,$D1hi 3146 3147 vpaddq $D1hi,$D2lo,$D2lo 3148 3149 vpsrlq \$42,$D2lo,$tmp 3150 vpsllq \$10,$D2hi,$D2hi 3151 vpandq $mask42,$D2lo,$H2 3152 vpaddq $tmp,$D2hi,$D2hi 3153 3154 vpaddq $D2hi,$H0,$H0 3155 vpsllq \$2,$D2hi,$D2hi 3156 3157 vpaddq $D2hi,$H0,$H0 3158 3159 vpsrlq \$44,$H0,$tmp # additional step 3160 vpandq $mask44,$H0,$H0 3161 3162 vpaddq $tmp,$H1,$H1 3163 3164 dec %eax 3165 jz .Ldone_init_vpmadd52 3166 3167 vpunpcklqdq $R1,$H1,$R1 # 1,2 3168 vpbroadcastq %x#$H1,%x#$H1 # 2,2 3169 vpunpcklqdq $R2,$H2,$R2 3170 vpbroadcastq %x#$H2,%x#$H2 3171 vpunpcklqdq $R0,$H0,$R0 3172 vpbroadcastq %x#$H0,%x#$H0 3173 3174 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3175 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3176 vpaddq $R1,$S1,$S1 3177 vpaddq $R2,$S2,$S2 3178 vpsllq \$2,$S1,$S1 3179 vpsllq \$2,$S2,$S2 3180 3181 jmp .Lmul_init_vpmadd52 3182 ud2 3183 3184 .align 32 3185 .Ldone_init_vpmadd52: 3186 vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 3187 vinserti128 
\$1,%x#$R2,$H2,$R2 3188 vinserti128 \$1,%x#$R0,$H0,$R0 3189 3190 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 3191 vpermq \$0b11011000,$R2,$R2 3192 vpermq \$0b11011000,$R0,$R0 3193 3194 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3195 vpaddq $R1,$S1,$S1 3196 vpsllq \$2,$S1,$S1 3197 3198 vmovq 0($ctx),%x#$H0 # load current hash value 3199 vmovq 8($ctx),%x#$H1 3200 vmovq 16($ctx),%x#$H2 3201 3202 test \$3,$len # is length 4*n+2? 3203 jnz .Ldone_init_vpmadd52_2x 3204 3205 vmovdqu64 $R0,64($ctx) # save key powers 3206 vpbroadcastq %x#$R0,$R0 # broadcast 4th power 3207 vmovdqu64 $R1,96($ctx) 3208 vpbroadcastq %x#$R1,$R1 3209 vmovdqu64 $R2,128($ctx) 3210 vpbroadcastq %x#$R2,$R2 3211 vmovdqu64 $S1,160($ctx) 3212 vpbroadcastq %x#$S1,$S1 3213 3214 jmp .Lblocks_vpmadd52_4x_key_loaded 3215 ud2 3216 3217 .align 32 3218 .Ldone_init_vpmadd52_2x: 3219 vmovdqu64 $R0,64($ctx) # save key powers 3220 vpsrldq \$8,$R0,$R0 # 0-1-0-2 3221 vmovdqu64 $R1,96($ctx) 3222 vpsrldq \$8,$R1,$R1 3223 vmovdqu64 $R2,128($ctx) 3224 vpsrldq \$8,$R2,$R2 3225 vmovdqu64 $S1,160($ctx) 3226 vpsrldq \$8,$S1,$S1 3227 jmp .Lblocks_vpmadd52_2x_key_loaded 3228 ud2 3229 3230 .align 32 3231 .Lblocks_vpmadd52_2x_do: 3232 vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers 3233 vmovdqu64 160+8($ctx),${S1}{%k1}{z} 3234 vmovdqu64 64+8($ctx),${R0}{%k1}{z} 3235 vmovdqu64 96+8($ctx),${R1}{%k1}{z} 3236 3237 .Lblocks_vpmadd52_2x_key_loaded: 3238 vmovdqu64 16*0($inp),$T2 # load data 3239 vpxorq $T3,$T3,$T3 3240 lea 16*2($inp),$inp 3241 3242 vpunpcklqdq $T3,$T2,$T1 # transpose data 3243 vpunpckhqdq $T3,$T2,$T3 3244 3245 # at this point 64-bit lanes are ordered as x-1-x-0 3246 3247 vpsrlq \$24,$T3,$T2 # splat the data 3248 vporq $PAD,$T2,$T2 3249 vpaddq $T2,$H2,$H2 # accumulate input 3250 vpandq $mask44,$T1,$T0 3251 vpsrlq \$44,$T1,$T1 3252 vpsllq \$20,$T3,$T3 3253 vporq $T3,$T1,$T1 3254 vpandq $mask44,$T1,$T1 3255 3256 jmp .Ltail_vpmadd52_2x 3257 ud2 3258 3259 .align 32 3260 .Loop_vpmadd52_4x: 3261 #vpaddq $T2,$H2,$H2 # accumulate input 3262 vpaddq $T0,$H0,$H0 3263 vpaddq $T1,$H1,$H1 3264 3265 vpxorq $D0lo,$D0lo,$D0lo 3266 vpmadd52luq $H2,$S1,$D0lo 3267 vpxorq $D0hi,$D0hi,$D0hi 3268 vpmadd52huq $H2,$S1,$D0hi 3269 vpxorq $D1lo,$D1lo,$D1lo 3270 vpmadd52luq $H2,$S2,$D1lo 3271 vpxorq $D1hi,$D1hi,$D1hi 3272 vpmadd52huq $H2,$S2,$D1hi 3273 vpxorq $D2lo,$D2lo,$D2lo 3274 vpmadd52luq $H2,$R0,$D2lo 3275 vpxorq $D2hi,$D2hi,$D2hi 3276 vpmadd52huq $H2,$R0,$D2hi 3277 3278 vmovdqu64 16*0($inp),$T2 # load data 3279 vmovdqu64 16*2($inp),$T3 3280 lea 16*4($inp),$inp 3281 vpmadd52luq $H0,$R0,$D0lo 3282 vpmadd52huq $H0,$R0,$D0hi 3283 vpmadd52luq $H0,$R1,$D1lo 3284 vpmadd52huq $H0,$R1,$D1hi 3285 vpmadd52luq $H0,$R2,$D2lo 3286 vpmadd52huq $H0,$R2,$D2hi 3287 3288 vpunpcklqdq $T3,$T2,$T1 # transpose data 3289 vpunpckhqdq $T3,$T2,$T3 3290 vpmadd52luq $H1,$S2,$D0lo 3291 vpmadd52huq $H1,$S2,$D0hi 3292 vpmadd52luq $H1,$R0,$D1lo 3293 vpmadd52huq $H1,$R0,$D1hi 3294 vpmadd52luq $H1,$R1,$D2lo 3295 vpmadd52huq $H1,$R1,$D2hi 3296 3297 ################################################################ 3298 # partial reduction (interleaved with data splat) 3299 vpsrlq \$44,$D0lo,$tmp 3300 vpsllq \$8,$D0hi,$D0hi 3301 vpandq $mask44,$D0lo,$H0 3302 vpaddq $tmp,$D0hi,$D0hi 3303 3304 vpsrlq \$24,$T3,$T2 3305 vporq $PAD,$T2,$T2 3306 vpaddq $D0hi,$D1lo,$D1lo 3307 3308 vpsrlq \$44,$D1lo,$tmp 3309 vpsllq \$8,$D1hi,$D1hi 3310 vpandq $mask44,$D1lo,$H1 3311 vpaddq $tmp,$D1hi,$D1hi 3312 3313 vpandq $mask44,$T1,$T0 3314 vpsrlq \$44,$T1,$T1 3315 vpsllq \$20,$T3,$T3 3316 vpaddq $D1hi,$D2lo,$D2lo 3317 3318 
vpsrlq \$42,$D2lo,$tmp 3319 vpsllq \$10,$D2hi,$D2hi 3320 vpandq $mask42,$D2lo,$H2 3321 vpaddq $tmp,$D2hi,$D2hi 3322 3323 vpaddq $T2,$H2,$H2 # accumulate input 3324 vpaddq $D2hi,$H0,$H0 3325 vpsllq \$2,$D2hi,$D2hi 3326 3327 vpaddq $D2hi,$H0,$H0 3328 vporq $T3,$T1,$T1 3329 vpandq $mask44,$T1,$T1 3330 3331 vpsrlq \$44,$H0,$tmp # additional step 3332 vpandq $mask44,$H0,$H0 3333 3334 vpaddq $tmp,$H1,$H1 3335 3336 sub \$4,$len # len-=64 3337 jnz .Loop_vpmadd52_4x 3338 3339 .Ltail_vpmadd52_4x: 3340 vmovdqu64 128($ctx),$R2 # load all key powers 3341 vmovdqu64 160($ctx),$S1 3342 vmovdqu64 64($ctx),$R0 3343 vmovdqu64 96($ctx),$R1 3344 3345 .Ltail_vpmadd52_2x: 3346 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3347 vpaddq $R2,$S2,$S2 3348 vpsllq \$2,$S2,$S2 3349 3350 #vpaddq $T2,$H2,$H2 # accumulate input 3351 vpaddq $T0,$H0,$H0 3352 vpaddq $T1,$H1,$H1 3353 3354 vpxorq $D0lo,$D0lo,$D0lo 3355 vpmadd52luq $H2,$S1,$D0lo 3356 vpxorq $D0hi,$D0hi,$D0hi 3357 vpmadd52huq $H2,$S1,$D0hi 3358 vpxorq $D1lo,$D1lo,$D1lo 3359 vpmadd52luq $H2,$S2,$D1lo 3360 vpxorq $D1hi,$D1hi,$D1hi 3361 vpmadd52huq $H2,$S2,$D1hi 3362 vpxorq $D2lo,$D2lo,$D2lo 3363 vpmadd52luq $H2,$R0,$D2lo 3364 vpxorq $D2hi,$D2hi,$D2hi 3365 vpmadd52huq $H2,$R0,$D2hi 3366 3367 vpmadd52luq $H0,$R0,$D0lo 3368 vpmadd52huq $H0,$R0,$D0hi 3369 vpmadd52luq $H0,$R1,$D1lo 3370 vpmadd52huq $H0,$R1,$D1hi 3371 vpmadd52luq $H0,$R2,$D2lo 3372 vpmadd52huq $H0,$R2,$D2hi 3373 3374 vpmadd52luq $H1,$S2,$D0lo 3375 vpmadd52huq $H1,$S2,$D0hi 3376 vpmadd52luq $H1,$R0,$D1lo 3377 vpmadd52huq $H1,$R0,$D1hi 3378 vpmadd52luq $H1,$R1,$D2lo 3379 vpmadd52huq $H1,$R1,$D2hi 3380 3381 ################################################################ 3382 # horizontal addition 3383 3384 mov \$1,%eax 3385 kmovw %eax,%k1 3386 vpsrldq \$8,$D0lo,$T0 3387 vpsrldq \$8,$D0hi,$H0 3388 vpsrldq \$8,$D1lo,$T1 3389 vpsrldq \$8,$D1hi,$H1 3390 vpaddq $T0,$D0lo,$D0lo 3391 vpaddq $H0,$D0hi,$D0hi 3392 vpsrldq \$8,$D2lo,$T2 3393 vpsrldq \$8,$D2hi,$H2 3394 vpaddq $T1,$D1lo,$D1lo 3395 vpaddq $H1,$D1hi,$D1hi 3396 vpermq \$0x2,$D0lo,$T0 3397 vpermq \$0x2,$D0hi,$H0 3398 vpaddq $T2,$D2lo,$D2lo 3399 vpaddq $H2,$D2hi,$D2hi 3400 3401 vpermq \$0x2,$D1lo,$T1 3402 vpermq \$0x2,$D1hi,$H1 3403 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3404 vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3405 vpermq \$0x2,$D2lo,$T2 3406 vpermq \$0x2,$D2hi,$H2 3407 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3408 vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3409 vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3410 vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3411 3412 ################################################################ 3413 # partial reduction 3414 vpsrlq \$44,$D0lo,$tmp 3415 vpsllq \$8,$D0hi,$D0hi 3416 vpandq $mask44,$D0lo,$H0 3417 vpaddq $tmp,$D0hi,$D0hi 3418 3419 vpaddq $D0hi,$D1lo,$D1lo 3420 3421 vpsrlq \$44,$D1lo,$tmp 3422 vpsllq \$8,$D1hi,$D1hi 3423 vpandq $mask44,$D1lo,$H1 3424 vpaddq $tmp,$D1hi,$D1hi 3425 3426 vpaddq $D1hi,$D2lo,$D2lo 3427 3428 vpsrlq \$42,$D2lo,$tmp 3429 vpsllq \$10,$D2hi,$D2hi 3430 vpandq $mask42,$D2lo,$H2 3431 vpaddq $tmp,$D2hi,$D2hi 3432 3433 vpaddq $D2hi,$H0,$H0 3434 vpsllq \$2,$D2hi,$D2hi 3435 3436 vpaddq $D2hi,$H0,$H0 3437 3438 vpsrlq \$44,$H0,$tmp # additional step 3439 vpandq $mask44,$H0,$H0 3440 3441 vpaddq $tmp,$H1,$H1 3442 # at this point $len is 3443 # either 4*n+2 or 0... 
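#
# Before the final length check below, a closer look at the partial
# reduction above (a sketch; limbs are 44/44/42 bits wide and, as their
# names suggest, .Lx_mask44/.Lx_mask42 are assumed to hold 2^44-1/2^42-1
# per lane; vpmadd52huq accumulates bits 52..103 of each product, i.e. a
# value scaled by 2^52):
#
#   carry into limb1 = (D0lo>>44) + D0hi*2^8     since 2^52 = 2^8*2^44
#   carry into limb2 = (D1lo>>44) + D1hi*2^8
#   the part of limb2 at or above 2^42 represents multiples of 2^130,
#   and 2^130 = 5 mod p, so (D2lo>>42) + D2hi*2^10 is folded back into
#   limb0 multiplied by 5 (added once, then once more shifted left by 2)
#   one last 44-bit carry moves from limb0 into limb1
#
# The same wrap explains the precomputed s1/s2 = 20*r1/20*r2: cross terms
# such as h1*r2 carry weight 2^(44+88) = 4*2^130 = 4*5 mod p.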
3444 sub \$2,$len # len-=32 3445 ja .Lblocks_vpmadd52_4x_do 3446 3447 vmovq %x#$H0,0($ctx) 3448 vmovq %x#$H1,8($ctx) 3449 vmovq %x#$H2,16($ctx) 3450 vzeroall 3451 3452 .Lno_data_vpmadd52_4x: 3453 RET 3454 .size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x 3455 ___ 3456 } 3457 { 3458 ######################################################################## 3459 # As implied by its name 8x subroutine processes 8 blocks in parallel... 3460 # This is intermediate version, as it's used only in cases when input 3461 # length is either 8*n, 8*n+1 or 8*n+2... 3462 3463 my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 3464 my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 3465 my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 3466 my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); 3467 3468 $code.=<<___; 3469 .type poly1305_blocks_vpmadd52_8x,\@function,4 3470 .align 32 3471 poly1305_blocks_vpmadd52_8x: 3472 shr \$4,$len 3473 jz .Lno_data_vpmadd52_8x # too short 3474 3475 shl \$40,$padbit 3476 mov 64($ctx),%r8 # peek on power of the key 3477 3478 vmovdqa64 .Lx_mask44(%rip),$mask44 3479 vmovdqa64 .Lx_mask42(%rip),$mask42 3480 3481 test %r8,%r8 # is power value impossible? 3482 js .Linit_vpmadd52 # if it is, then init R[4] 3483 3484 vmovq 0($ctx),%x#$H0 # load current hash value 3485 vmovq 8($ctx),%x#$H1 3486 vmovq 16($ctx),%x#$H2 3487 3488 .Lblocks_vpmadd52_8x: 3489 ################################################################ 3490 # fist we calculate more key powers 3491 3492 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers 3493 vmovdqu64 160($ctx),$S1 3494 vmovdqu64 64($ctx),$R0 3495 vmovdqu64 96($ctx),$R1 3496 3497 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3498 vpaddq $R2,$S2,$S2 3499 vpsllq \$2,$S2,$S2 3500 3501 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power 3502 vpbroadcastq %x#$R0,$RR0 3503 vpbroadcastq %x#$R1,$RR1 3504 3505 vpxorq $D0lo,$D0lo,$D0lo 3506 vpmadd52luq $RR2,$S1,$D0lo 3507 vpxorq $D0hi,$D0hi,$D0hi 3508 vpmadd52huq $RR2,$S1,$D0hi 3509 vpxorq $D1lo,$D1lo,$D1lo 3510 vpmadd52luq $RR2,$S2,$D1lo 3511 vpxorq $D1hi,$D1hi,$D1hi 3512 vpmadd52huq $RR2,$S2,$D1hi 3513 vpxorq $D2lo,$D2lo,$D2lo 3514 vpmadd52luq $RR2,$R0,$D2lo 3515 vpxorq $D2hi,$D2hi,$D2hi 3516 vpmadd52huq $RR2,$R0,$D2hi 3517 3518 vpmadd52luq $RR0,$R0,$D0lo 3519 vpmadd52huq $RR0,$R0,$D0hi 3520 vpmadd52luq $RR0,$R1,$D1lo 3521 vpmadd52huq $RR0,$R1,$D1hi 3522 vpmadd52luq $RR0,$R2,$D2lo 3523 vpmadd52huq $RR0,$R2,$D2hi 3524 3525 vpmadd52luq $RR1,$S2,$D0lo 3526 vpmadd52huq $RR1,$S2,$D0hi 3527 vpmadd52luq $RR1,$R0,$D1lo 3528 vpmadd52huq $RR1,$R0,$D1hi 3529 vpmadd52luq $RR1,$R1,$D2lo 3530 vpmadd52huq $RR1,$R1,$D2hi 3531 3532 ################################################################ 3533 # partial reduction 3534 vpsrlq \$44,$D0lo,$tmp 3535 vpsllq \$8,$D0hi,$D0hi 3536 vpandq $mask44,$D0lo,$RR0 3537 vpaddq $tmp,$D0hi,$D0hi 3538 3539 vpaddq $D0hi,$D1lo,$D1lo 3540 3541 vpsrlq \$44,$D1lo,$tmp 3542 vpsllq \$8,$D1hi,$D1hi 3543 vpandq $mask44,$D1lo,$RR1 3544 vpaddq $tmp,$D1hi,$D1hi 3545 3546 vpaddq $D1hi,$D2lo,$D2lo 3547 3548 vpsrlq \$42,$D2lo,$tmp 3549 vpsllq \$10,$D2hi,$D2hi 3550 vpandq $mask42,$D2lo,$RR2 3551 vpaddq $tmp,$D2hi,$D2hi 3552 3553 vpaddq $D2hi,$RR0,$RR0 3554 vpsllq \$2,$D2hi,$D2hi 3555 3556 vpaddq $D2hi,$RR0,$RR0 3557 3558 vpsrlq \$44,$RR0,$tmp # additional step 3559 vpandq $mask44,$RR0,$RR0 3560 3561 vpaddq $tmp,$RR1,$RR1 3562 3563 ################################################################ 3564 # At this point Rx holds 1324 powers, RRx - 5768, and the 
goal 3565 # is 15263748, which reflects how data is loaded... 3566 3567 vpunpcklqdq $R2,$RR2,$T2 # 3748 3568 vpunpckhqdq $R2,$RR2,$R2 # 1526 3569 vpunpcklqdq $R0,$RR0,$T0 3570 vpunpckhqdq $R0,$RR0,$R0 3571 vpunpcklqdq $R1,$RR1,$T1 3572 vpunpckhqdq $R1,$RR1,$R1 3573 ___ 3574 ######## switch to %zmm 3575 map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3576 map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3577 map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3578 map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); 3579 3580 $code.=<<___; 3581 vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 3582 vshufi64x2 \$0x44,$R0,$T0,$RR0 3583 vshufi64x2 \$0x44,$R1,$T1,$RR1 3584 3585 vmovdqu64 16*0($inp),$T2 # load data 3586 vmovdqu64 16*4($inp),$T3 3587 lea 16*8($inp),$inp 3588 3589 vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 3590 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 3591 vpaddq $RR2,$SS2,$SS2 3592 vpaddq $RR1,$SS1,$SS1 3593 vpsllq \$2,$SS2,$SS2 3594 vpsllq \$2,$SS1,$SS1 3595 3596 vpbroadcastq $padbit,$PAD 3597 vpbroadcastq %x#$mask44,$mask44 3598 vpbroadcastq %x#$mask42,$mask42 3599 3600 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power 3601 vpbroadcastq %x#$SS2,$S2 3602 vpbroadcastq %x#$RR0,$R0 3603 vpbroadcastq %x#$RR1,$R1 3604 vpbroadcastq %x#$RR2,$R2 3605 3606 vpunpcklqdq $T3,$T2,$T1 # transpose data 3607 vpunpckhqdq $T3,$T2,$T3 3608 3609 # at this point 64-bit lanes are ordered as 73625140 3610 3611 vpsrlq \$24,$T3,$T2 # splat the data 3612 vporq $PAD,$T2,$T2 3613 vpaddq $T2,$H2,$H2 # accumulate input 3614 vpandq $mask44,$T1,$T0 3615 vpsrlq \$44,$T1,$T1 3616 vpsllq \$20,$T3,$T3 3617 vporq $T3,$T1,$T1 3618 vpandq $mask44,$T1,$T1 3619 3620 sub \$8,$len 3621 jz .Ltail_vpmadd52_8x 3622 jmp .Loop_vpmadd52_8x 3623 3624 .align 32 3625 .Loop_vpmadd52_8x: 3626 #vpaddq $T2,$H2,$H2 # accumulate input 3627 vpaddq $T0,$H0,$H0 3628 vpaddq $T1,$H1,$H1 3629 3630 vpxorq $D0lo,$D0lo,$D0lo 3631 vpmadd52luq $H2,$S1,$D0lo 3632 vpxorq $D0hi,$D0hi,$D0hi 3633 vpmadd52huq $H2,$S1,$D0hi 3634 vpxorq $D1lo,$D1lo,$D1lo 3635 vpmadd52luq $H2,$S2,$D1lo 3636 vpxorq $D1hi,$D1hi,$D1hi 3637 vpmadd52huq $H2,$S2,$D1hi 3638 vpxorq $D2lo,$D2lo,$D2lo 3639 vpmadd52luq $H2,$R0,$D2lo 3640 vpxorq $D2hi,$D2hi,$D2hi 3641 vpmadd52huq $H2,$R0,$D2hi 3642 3643 vmovdqu64 16*0($inp),$T2 # load data 3644 vmovdqu64 16*4($inp),$T3 3645 lea 16*8($inp),$inp 3646 vpmadd52luq $H0,$R0,$D0lo 3647 vpmadd52huq $H0,$R0,$D0hi 3648 vpmadd52luq $H0,$R1,$D1lo 3649 vpmadd52huq $H0,$R1,$D1hi 3650 vpmadd52luq $H0,$R2,$D2lo 3651 vpmadd52huq $H0,$R2,$D2hi 3652 3653 vpunpcklqdq $T3,$T2,$T1 # transpose data 3654 vpunpckhqdq $T3,$T2,$T3 3655 vpmadd52luq $H1,$S2,$D0lo 3656 vpmadd52huq $H1,$S2,$D0hi 3657 vpmadd52luq $H1,$R0,$D1lo 3658 vpmadd52huq $H1,$R0,$D1hi 3659 vpmadd52luq $H1,$R1,$D2lo 3660 vpmadd52huq $H1,$R1,$D2hi 3661 3662 ################################################################ 3663 # partial reduction (interleaved with data splat) 3664 vpsrlq \$44,$D0lo,$tmp 3665 vpsllq \$8,$D0hi,$D0hi 3666 vpandq $mask44,$D0lo,$H0 3667 vpaddq $tmp,$D0hi,$D0hi 3668 3669 vpsrlq \$24,$T3,$T2 3670 vporq $PAD,$T2,$T2 3671 vpaddq $D0hi,$D1lo,$D1lo 3672 3673 vpsrlq \$44,$D1lo,$tmp 3674 vpsllq \$8,$D1hi,$D1hi 3675 vpandq $mask44,$D1lo,$H1 3676 vpaddq $tmp,$D1hi,$D1hi 3677 3678 vpandq $mask44,$T1,$T0 3679 vpsrlq \$44,$T1,$T1 3680 vpsllq \$20,$T3,$T3 3681 vpaddq $D1hi,$D2lo,$D2lo 3682 3683 vpsrlq \$42,$D2lo,$tmp 3684 vpsllq \$10,$D2hi,$D2hi 3685 vpandq $mask42,$D2lo,$H2 3686 vpaddq $tmp,$D2hi,$D2hi 3687 3688 vpaddq $T2,$H2,$H2 # accumulate input 3689 vpaddq $D2hi,$H0,$H0 3690 vpsllq \$2,$D2hi,$D2hi 3691 
3692 vpaddq $D2hi,$H0,$H0 3693 vporq $T3,$T1,$T1 3694 vpandq $mask44,$T1,$T1 3695 3696 vpsrlq \$44,$H0,$tmp # additional step 3697 vpandq $mask44,$H0,$H0 3698 3699 vpaddq $tmp,$H1,$H1 3700 3701 sub \$8,$len # len-=128 3702 jnz .Loop_vpmadd52_8x 3703 3704 .Ltail_vpmadd52_8x: 3705 #vpaddq $T2,$H2,$H2 # accumulate input 3706 vpaddq $T0,$H0,$H0 3707 vpaddq $T1,$H1,$H1 3708 3709 vpxorq $D0lo,$D0lo,$D0lo 3710 vpmadd52luq $H2,$SS1,$D0lo 3711 vpxorq $D0hi,$D0hi,$D0hi 3712 vpmadd52huq $H2,$SS1,$D0hi 3713 vpxorq $D1lo,$D1lo,$D1lo 3714 vpmadd52luq $H2,$SS2,$D1lo 3715 vpxorq $D1hi,$D1hi,$D1hi 3716 vpmadd52huq $H2,$SS2,$D1hi 3717 vpxorq $D2lo,$D2lo,$D2lo 3718 vpmadd52luq $H2,$RR0,$D2lo 3719 vpxorq $D2hi,$D2hi,$D2hi 3720 vpmadd52huq $H2,$RR0,$D2hi 3721 3722 vpmadd52luq $H0,$RR0,$D0lo 3723 vpmadd52huq $H0,$RR0,$D0hi 3724 vpmadd52luq $H0,$RR1,$D1lo 3725 vpmadd52huq $H0,$RR1,$D1hi 3726 vpmadd52luq $H0,$RR2,$D2lo 3727 vpmadd52huq $H0,$RR2,$D2hi 3728 3729 vpmadd52luq $H1,$SS2,$D0lo 3730 vpmadd52huq $H1,$SS2,$D0hi 3731 vpmadd52luq $H1,$RR0,$D1lo 3732 vpmadd52huq $H1,$RR0,$D1hi 3733 vpmadd52luq $H1,$RR1,$D2lo 3734 vpmadd52huq $H1,$RR1,$D2hi 3735 3736 ################################################################ 3737 # horizontal addition 3738 3739 mov \$1,%eax 3740 kmovw %eax,%k1 3741 vpsrldq \$8,$D0lo,$T0 3742 vpsrldq \$8,$D0hi,$H0 3743 vpsrldq \$8,$D1lo,$T1 3744 vpsrldq \$8,$D1hi,$H1 3745 vpaddq $T0,$D0lo,$D0lo 3746 vpaddq $H0,$D0hi,$D0hi 3747 vpsrldq \$8,$D2lo,$T2 3748 vpsrldq \$8,$D2hi,$H2 3749 vpaddq $T1,$D1lo,$D1lo 3750 vpaddq $H1,$D1hi,$D1hi 3751 vpermq \$0x2,$D0lo,$T0 3752 vpermq \$0x2,$D0hi,$H0 3753 vpaddq $T2,$D2lo,$D2lo 3754 vpaddq $H2,$D2hi,$D2hi 3755 3756 vpermq \$0x2,$D1lo,$T1 3757 vpermq \$0x2,$D1hi,$H1 3758 vpaddq $T0,$D0lo,$D0lo 3759 vpaddq $H0,$D0hi,$D0hi 3760 vpermq \$0x2,$D2lo,$T2 3761 vpermq \$0x2,$D2hi,$H2 3762 vpaddq $T1,$D1lo,$D1lo 3763 vpaddq $H1,$D1hi,$D1hi 3764 vextracti64x4 \$1,$D0lo,%y#$T0 3765 vextracti64x4 \$1,$D0hi,%y#$H0 3766 vpaddq $T2,$D2lo,$D2lo 3767 vpaddq $H2,$D2hi,$D2hi 3768 3769 vextracti64x4 \$1,$D1lo,%y#$T1 3770 vextracti64x4 \$1,$D1hi,%y#$H1 3771 vextracti64x4 \$1,$D2lo,%y#$T2 3772 vextracti64x4 \$1,$D2hi,%y#$H2 3773 ___ 3774 ######## switch back to %ymm 3775 map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3776 map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3777 map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3778 3779 $code.=<<___; 3780 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3781 vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3782 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3783 vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3784 vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3785 vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3786 3787 ################################################################ 3788 # partial reduction 3789 vpsrlq \$44,$D0lo,$tmp 3790 vpsllq \$8,$D0hi,$D0hi 3791 vpandq $mask44,$D0lo,$H0 3792 vpaddq $tmp,$D0hi,$D0hi 3793 3794 vpaddq $D0hi,$D1lo,$D1lo 3795 3796 vpsrlq \$44,$D1lo,$tmp 3797 vpsllq \$8,$D1hi,$D1hi 3798 vpandq $mask44,$D1lo,$H1 3799 vpaddq $tmp,$D1hi,$D1hi 3800 3801 vpaddq $D1hi,$D2lo,$D2lo 3802 3803 vpsrlq \$42,$D2lo,$tmp 3804 vpsllq \$10,$D2hi,$D2hi 3805 vpandq $mask42,$D2lo,$H2 3806 vpaddq $tmp,$D2hi,$D2hi 3807 3808 vpaddq $D2hi,$H0,$H0 3809 vpsllq \$2,$D2hi,$D2hi 3810 3811 vpaddq $D2hi,$H0,$H0 3812 3813 vpsrlq \$44,$H0,$tmp # additional step 3814 vpandq $mask44,$H0,$H0 3815 3816 vpaddq $tmp,$H1,$H1 3817 3818 ################################################################ 3819 3820 vmovq %x#$H0,0($ctx) 3821 vmovq %x#$H1,8($ctx) 3822 vmovq %x#$H2,16($ctx) 3823 vzeroall 3824 3825 
.Lno_data_vpmadd52_8x: 3826 RET 3827 .size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x 3828 ___ 3829 } 3830 $code.=<<___; 3831 .type poly1305_emit_base2_44,\@function,3 3832 .align 32 3833 poly1305_emit_base2_44: 3834 mov 0($ctx),%r8 # load hash value 3835 mov 8($ctx),%r9 3836 mov 16($ctx),%r10 3837 3838 mov %r9,%rax 3839 shr \$20,%r9 3840 shl \$44,%rax 3841 mov %r10,%rcx 3842 shr \$40,%r10 3843 shl \$24,%rcx 3844 3845 add %rax,%r8 3846 adc %rcx,%r9 3847 adc \$0,%r10 3848 3849 mov %r8,%rax 3850 add \$5,%r8 # compare to modulus 3851 mov %r9,%rcx 3852 adc \$0,%r9 3853 adc \$0,%r10 3854 shr \$2,%r10 # did 130-bit value overflow? 3855 cmovnz %r8,%rax 3856 cmovnz %r9,%rcx 3857 3858 add 0($nonce),%rax # accumulate nonce 3859 adc 8($nonce),%rcx 3860 mov %rax,0($mac) # write result 3861 mov %rcx,8($mac) 3862 3863 RET 3864 .size poly1305_emit_base2_44,.-poly1305_emit_base2_44 3865 ___ 3866 } } } 3867 } 3868 3869 if (!$kernel) 3870 { # chacha20-poly1305 helpers 3871 my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 3872 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 3873 $code.=<<___; 3874 .globl xor128_encrypt_n_pad 3875 .type xor128_encrypt_n_pad,\@abi-omnipotent 3876 .align 16 3877 xor128_encrypt_n_pad: 3878 sub $otp,$inp 3879 sub $otp,$out 3880 mov $len,%r10 # put len aside 3881 shr \$4,$len # len / 16 3882 jz .Ltail_enc 3883 nop 3884 .Loop_enc_xmm: 3885 movdqu ($inp,$otp),%xmm0 3886 pxor ($otp),%xmm0 3887 movdqu %xmm0,($out,$otp) 3888 movdqa %xmm0,($otp) 3889 lea 16($otp),$otp 3890 dec $len 3891 jnz .Loop_enc_xmm 3892 3893 and \$15,%r10 # len % 16 3894 jz .Ldone_enc 3895 3896 .Ltail_enc: 3897 mov \$16,$len 3898 sub %r10,$len 3899 xor %eax,%eax 3900 .Loop_enc_byte: 3901 mov ($inp,$otp),%al 3902 xor ($otp),%al 3903 mov %al,($out,$otp) 3904 mov %al,($otp) 3905 lea 1($otp),$otp 3906 dec %r10 3907 jnz .Loop_enc_byte 3908 3909 xor %eax,%eax 3910 .Loop_enc_pad: 3911 mov %al,($otp) 3912 lea 1($otp),$otp 3913 dec $len 3914 jnz .Loop_enc_pad 3915 3916 .Ldone_enc: 3917 mov $otp,%rax 3918 RET 3919 .size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad 3920 3921 .globl xor128_decrypt_n_pad 3922 .type xor128_decrypt_n_pad,\@abi-omnipotent 3923 .align 16 3924 xor128_decrypt_n_pad: 3925 sub $otp,$inp 3926 sub $otp,$out 3927 mov $len,%r10 # put len aside 3928 shr \$4,$len # len / 16 3929 jz .Ltail_dec 3930 nop 3931 .Loop_dec_xmm: 3932 movdqu ($inp,$otp),%xmm0 3933 movdqa ($otp),%xmm1 3934 pxor %xmm0,%xmm1 3935 movdqu %xmm1,($out,$otp) 3936 movdqa %xmm0,($otp) 3937 lea 16($otp),$otp 3938 dec $len 3939 jnz .Loop_dec_xmm 3940 3941 pxor %xmm1,%xmm1 3942 and \$15,%r10 # len % 16 3943 jz .Ldone_dec 3944 3945 .Ltail_dec: 3946 mov \$16,$len 3947 sub %r10,$len 3948 xor %eax,%eax 3949 xor %r11d,%r11d 3950 .Loop_dec_byte: 3951 mov ($inp,$otp),%r11b 3952 mov ($otp),%al 3953 xor %r11b,%al 3954 mov %al,($out,$otp) 3955 mov %r11b,($otp) 3956 lea 1($otp),$otp 3957 dec %r10 3958 jnz .Loop_dec_byte 3959 3960 xor %eax,%eax 3961 .Loop_dec_pad: 3962 mov %al,($otp) 3963 lea 1($otp),$otp 3964 dec $len 3965 jnz .Loop_dec_pad 3966 3967 .Ldone_dec: 3968 mov $otp,%rax 3969 RET 3970 .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad 3971 ___ 3972 } 3973 3974 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3975 # CONTEXT *context,DISPATCHER_CONTEXT *disp) 3976 if ($win64) { 3977 $rec="%rcx"; 3978 $frame="%rdx"; 3979 $context="%r8"; 3980 $disp="%r9"; 3981 3982 $code.=<<___; 3983 .extern __imp_RtlVirtualUnwind 3984 .type se_handler,\@abi-omnipotent 3985 .align 16 3986 se_handler: 3987 
push %rsi 3988 push %rdi 3989 push %rbx 3990 push %rbp 3991 push %r12 3992 push %r13 3993 push %r14 3994 push %r15 3995 pushfq 3996 sub \$64,%rsp 3997 3998 mov 120($context),%rax # pull context->Rax 3999 mov 248($context),%rbx # pull context->Rip 4000 4001 mov 8($disp),%rsi # disp->ImageBase 4002 mov 56($disp),%r11 # disp->HandlerData 4003 4004 mov 0(%r11),%r10d # HandlerData[0] 4005 lea (%rsi,%r10),%r10 # prologue label 4006 cmp %r10,%rbx # context->Rip<.Lprologue 4007 jb .Lcommon_seh_tail 4008 4009 mov 152($context),%rax # pull context->Rsp 4010 4011 mov 4(%r11),%r10d # HandlerData[1] 4012 lea (%rsi,%r10),%r10 # epilogue label 4013 cmp %r10,%rbx # context->Rip>=.Lepilogue 4014 jae .Lcommon_seh_tail 4015 4016 lea 48(%rax),%rax 4017 4018 mov -8(%rax),%rbx 4019 mov -16(%rax),%rbp 4020 mov -24(%rax),%r12 4021 mov -32(%rax),%r13 4022 mov -40(%rax),%r14 4023 mov -48(%rax),%r15 4024 mov %rbx,144($context) # restore context->Rbx 4025 mov %rbp,160($context) # restore context->Rbp 4026 mov %r12,216($context) # restore context->R12 4027 mov %r13,224($context) # restore context->R13 4028 mov %r14,232($context) # restore context->R14 4029 mov %r15,240($context) # restore context->R14 4030 4031 jmp .Lcommon_seh_tail 4032 .size se_handler,.-se_handler 4033 4034 .type avx_handler,\@abi-omnipotent 4035 .align 16 4036 avx_handler: 4037 push %rsi 4038 push %rdi 4039 push %rbx 4040 push %rbp 4041 push %r12 4042 push %r13 4043 push %r14 4044 push %r15 4045 pushfq 4046 sub \$64,%rsp 4047 4048 mov 120($context),%rax # pull context->Rax 4049 mov 248($context),%rbx # pull context->Rip 4050 4051 mov 8($disp),%rsi # disp->ImageBase 4052 mov 56($disp),%r11 # disp->HandlerData 4053 4054 mov 0(%r11),%r10d # HandlerData[0] 4055 lea (%rsi,%r10),%r10 # prologue label 4056 cmp %r10,%rbx # context->Rip<prologue label 4057 jb .Lcommon_seh_tail 4058 4059 mov 152($context),%rax # pull context->Rsp 4060 4061 mov 4(%r11),%r10d # HandlerData[1] 4062 lea (%rsi,%r10),%r10 # epilogue label 4063 cmp %r10,%rbx # context->Rip>=epilogue label 4064 jae .Lcommon_seh_tail 4065 4066 mov 208($context),%rax # pull context->R11 4067 4068 lea 0x50(%rax),%rsi 4069 lea 0xf8(%rax),%rax 4070 lea 512($context),%rdi # &context.Xmm6 4071 mov \$20,%ecx 4072 .long 0xa548f3fc # cld; rep movsq 4073 4074 .Lcommon_seh_tail: 4075 mov 8(%rax),%rdi 4076 mov 16(%rax),%rsi 4077 mov %rax,152($context) # restore context->Rsp 4078 mov %rsi,168($context) # restore context->Rsi 4079 mov %rdi,176($context) # restore context->Rdi 4080 4081 mov 40($disp),%rdi # disp->ContextRecord 4082 mov $context,%rsi # context 4083 mov \$154,%ecx # sizeof(CONTEXT) 4084 .long 0xa548f3fc # cld; rep movsq 4085 4086 mov $disp,%rsi 4087 xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER 4088 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4089 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4090 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4091 mov 40(%rsi),%r10 # disp->ContextRecord 4092 lea 56(%rsi),%r11 # &disp->HandlerData 4093 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4094 mov %r10,32(%rsp) # arg5 4095 mov %r11,40(%rsp) # arg6 4096 mov %r12,48(%rsp) # arg7 4097 mov %rcx,56(%rsp) # arg8, (NULL) 4098 call *__imp_RtlVirtualUnwind(%rip) 4099 4100 mov \$1,%eax # ExceptionContinueSearch 4101 add \$64,%rsp 4102 popfq 4103 pop %r15 4104 pop %r14 4105 pop %r13 4106 pop %r12 4107 pop %rbp 4108 pop %rbx 4109 pop %rdi 4110 pop %rsi 4111 RET 4112 .size avx_handler,.-avx_handler 4113 4114 .section .pdata 4115 .align 4 4116 .rva .LSEH_begin_poly1305_init_x86_64 4117 .rva .LSEH_end_poly1305_init_x86_64 4118 .rva 
.LSEH_info_poly1305_init_x86_64 4119 4120 .rva .LSEH_begin_poly1305_blocks_x86_64 4121 .rva .LSEH_end_poly1305_blocks_x86_64 4122 .rva .LSEH_info_poly1305_blocks_x86_64 4123 4124 .rva .LSEH_begin_poly1305_emit_x86_64 4125 .rva .LSEH_end_poly1305_emit_x86_64 4126 .rva .LSEH_info_poly1305_emit_x86_64 4127 ___ 4128 $code.=<<___ if ($avx); 4129 .rva .LSEH_begin_poly1305_blocks_avx 4130 .rva .Lbase2_64_avx 4131 .rva .LSEH_info_poly1305_blocks_avx_1 4132 4133 .rva .Lbase2_64_avx 4134 .rva .Leven_avx 4135 .rva .LSEH_info_poly1305_blocks_avx_2 4136 4137 .rva .Leven_avx 4138 .rva .LSEH_end_poly1305_blocks_avx 4139 .rva .LSEH_info_poly1305_blocks_avx_3 4140 4141 .rva .LSEH_begin_poly1305_emit_avx 4142 .rva .LSEH_end_poly1305_emit_avx 4143 .rva .LSEH_info_poly1305_emit_avx 4144 ___ 4145 $code.=<<___ if ($avx>1); 4146 .rva .LSEH_begin_poly1305_blocks_avx2 4147 .rva .Lbase2_64_avx2 4148 .rva .LSEH_info_poly1305_blocks_avx2_1 4149 4150 .rva .Lbase2_64_avx2 4151 .rva .Leven_avx2 4152 .rva .LSEH_info_poly1305_blocks_avx2_2 4153 4154 .rva .Leven_avx2 4155 .rva .LSEH_end_poly1305_blocks_avx2 4156 .rva .LSEH_info_poly1305_blocks_avx2_3 4157 ___ 4158 $code.=<<___ if ($avx>2); 4159 .rva .LSEH_begin_poly1305_blocks_avx512 4160 .rva .LSEH_end_poly1305_blocks_avx512 4161 .rva .LSEH_info_poly1305_blocks_avx512 4162 ___ 4163 $code.=<<___; 4164 .section .xdata 4165 .align 8 4166 .LSEH_info_poly1305_init_x86_64: 4167 .byte 9,0,0,0 4168 .rva se_handler 4169 .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 4170 4171 .LSEH_info_poly1305_blocks_x86_64: 4172 .byte 9,0,0,0 4173 .rva se_handler 4174 .rva .Lblocks_body,.Lblocks_epilogue 4175 4176 .LSEH_info_poly1305_emit_x86_64: 4177 .byte 9,0,0,0 4178 .rva se_handler 4179 .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 4180 ___ 4181 $code.=<<___ if ($avx); 4182 .LSEH_info_poly1305_blocks_avx_1: 4183 .byte 9,0,0,0 4184 .rva se_handler 4185 .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] 4186 4187 .LSEH_info_poly1305_blocks_avx_2: 4188 .byte 9,0,0,0 4189 .rva se_handler 4190 .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] 4191 4192 .LSEH_info_poly1305_blocks_avx_3: 4193 .byte 9,0,0,0 4194 .rva avx_handler 4195 .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] 4196 4197 .LSEH_info_poly1305_emit_avx: 4198 .byte 9,0,0,0 4199 .rva se_handler 4200 .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx 4201 ___ 4202 $code.=<<___ if ($avx>1); 4203 .LSEH_info_poly1305_blocks_avx2_1: 4204 .byte 9,0,0,0 4205 .rva se_handler 4206 .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] 4207 4208 .LSEH_info_poly1305_blocks_avx2_2: 4209 .byte 9,0,0,0 4210 .rva se_handler 4211 .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] 4212 4213 .LSEH_info_poly1305_blocks_avx2_3: 4214 .byte 9,0,0,0 4215 .rva avx_handler 4216 .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] 4217 ___ 4218 $code.=<<___ if ($avx>2); 4219 .LSEH_info_poly1305_blocks_avx512: 4220 .byte 9,0,0,0 4221 .rva avx_handler 4222 .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] 4223 ___ 4224 } 4225 4226 open SELF,$0; 4227 while(<SELF>) { 4228 next if (/^#!/); 4229 last if (!s/^#/\/\// and !/^$/); 4230 print; 4231 } 4232 close SELF; 4233 4234 foreach (split('\n',$code)) { 4235 s/\`([^\`]*)\`/eval($1)/ge; 4236 s/%r([a-z]+)#d/%e$1/g; 4237 s/%r([0-9]+)#d/%r$1d/g; 4238 s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; 4239 4240 if ($kernel) { 4241 s/(^\.type.*),[0-9]+$/\1/; 4242 
s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; 4243 next if /^\.cfi.*/; 4244 } 4245 4246 print $_,"\n"; 4247 } 4248 close STDOUT;
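# A key to the notation resolved by the post-processing loop above
# (the examples are illustrative):
#
#     `16*5-64`($ctx)     backquoted arithmetic is eval()ed into a
#                         plain immediate offset (here 80-64 = 16)
#     %rax#d -> %eax      legacy registers take their 32-bit form
#     %r10#d -> %r10d     likewise for the numbered registers
#     %x#%ymm1 -> %xmm1   so "%x#$T0" selects the xmm form of whatever
#                         register $T0 currently names (%y#/%z# too)
#
# In kernel mode the loop additionally strips trailing argument counts from
# .type directives, rewrites @abi-omnipotent types to @function, and drops
# .cfi directives before printing.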