#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note Cortex-A15 processes one byte in
# 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally caller was expected to maintain specific *dword* order in
# h[0-7], namely with most significant dword at *lower* address, which
# was reflected in below two parameters as 0 and 4. Now caller is
# expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

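	@ At this point T (in $Tlo/$Thi) is X[i] + h + Sigma1(e),
	@ $t0/$t1 hold Ch(e,f,g) and $t2/$t3 hold K[i]; the adds/adc
	@ pairs below fold those into T and add T into d, then the
	@ second half of the round adds Sigma0(a) and Maj(a,b,c).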
	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#if __ARM_ARCH__>=7
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code	32
# endif
#endif

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
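	@ NEON has no 64-bit rotate, so each ROTR(x,n) is built from a
	@ pair of instructions: the vshr.u64 above computes x>>n and the
	@ vsli.64 (shift left and insert) below merges in x<<(64-n),
	@ e.g. a right shift by 14 plus an insert-left by 50 gives ROTR(e,14).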
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	VFP_ABI_PUSH
	adr	$Ktbl,.Lsha512_block_data_order
	sub	$Ktbl,$Ktbl,.Lsha512_block_data_order-K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	$cnt,#4
.L16_79_neon:
	subs	$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne	.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia	$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia	$ctx,{$A-$H}	@ save context
	teq	$inp,$len
	sub	$Ktbl,#640	@ rewind K512
	bne	.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT; # enforce flush
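
# Usage sketch (the invocation below is illustrative, not mandated by the
# build system): the argument-parsing loop near the top of this script picks
# the first argument that looks like a filename and reopens STDOUT on it, so
#
#	perl sha512-armv4.pl sha512-core.S
#
# writes the generated assembly to sha512-core.S.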