#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on a single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed; see https://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. As a side note, Cortex-A15 processes one byte
# in 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order
# in h[0-7], namely with the most significant dword at the *lower* address,
# which was reflected in the two parameters below as 0 and 4. Now the caller
# is expected to maintain native byte order for whole 64-bit values.
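#
# Concretely: on little-endian builds the preprocessor block emitted below
# defines LO=0 and HI=4 (big-endian builds swap the two), so loads such as
# "ldr $Elo,[$ctx,#$Eoff+$lo]" always address the correct 32-bit half of
# each natively-ordered 64-bit state word.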
$hi="HI";
$lo="LO";
# ====================================================================

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#if __ARM_ARCH__>=7
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code	32
# endif
#endif

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	vadd.i64	@_[0],d30		@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1			@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]	@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]	@ from NEON_00_15
	veor		$s1,$t1			@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]	@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	VFP_ABI_PUSH
	adr	$Ktbl,.Lsha512_block_data_order
	sub	$Ktbl,$Ktbl,.Lsha512_block_data_order-K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov	$cnt,#4
.L16_79_neon:
	subs	$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne	.L16_79_neon

	vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia	$ctx,{d24-d31}		@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia	$ctx,{$A-$H}		@ save context
	teq	$inp,$len
	sub	$Ktbl,#640	@ rewind K512
	bne	.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT; # enforce flush
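
# Note on the self-scan above: the while(<SELF>) loop re-reads this script,
# skips the "#!" line and re-emits the leading "#" comment header as "@"
# assembler comments, so the licence and history notes survive into the
# generated file; "print $code" then emits the actual assembly. A typical
# invocation (roughly how the kernel build drives it) is
#
#	perl sha512-armv4.pl sha512-core.S
#
# where the first "name.ext"-style argument is taken as the output file.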