~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/arm/crypto/sha512-armv4.pl

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 #!/usr/bin/env perl
  2 # SPDX-License-Identifier: GPL-2.0
  3 
  4 # This code is taken from the OpenSSL project but the author (Andy Polyakov)
  5 # has relicensed it under the GPLv2. Therefore this program is free software;
  6 # you can redistribute it and/or modify it under the terms of the GNU General
  7 # Public License version 2 as published by the Free Software Foundation.
  8 #
  9 # The original headers, including the original license headers, are
 10 # included below for completeness.
 11 
 12 # ====================================================================
 13 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 14 # project. The module is, however, dual licensed under OpenSSL and
 15 # CRYPTOGAMS licenses depending on where you obtain it. For further
 16 # details see https://www.openssl.org/~appro/cryptogams/.
 17 # ====================================================================
 18 
 19 # SHA512 block procedure for ARMv4. September 2007.
 20 
 21 # This code is ~4.5 (four and a half) times faster than code generated
 22 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
 23 # Xscale PXA250 core].
 24 #
 25 # July 2010.
 26 #
 27 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
 28 # Cortex A8 core and ~40 cycles per processed byte.
 29 
 30 # February 2011.
 31 #
 32 # Profiler-assisted and platform-specific optimization resulted in 7%
 33 # improvement on Cortex A8 core and ~38 cycles per byte.
 34 
 35 # March 2011.
 36 #
 37 # Add NEON implementation. On Cortex A8 it was measured to process
 38 # one byte in 23.3 cycles or ~60% faster than integer-only code.
 39 
 40 # August 2012.
 41 #
 42 # Improve NEON performance by 12% on Snapdragon S4. In absolute
 43 # terms it's 22.6 cycles per byte, which is a disappointing result.
 44 # Technical writers asserted that 3-way S4 pipeline can sustain
 45 # multiple NEON instructions per cycle, but dual NEON issue could
 46 # not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
 47 # for further details. On a side note, Cortex-A15 processes one byte in
 48 # 16 cycles.
 49 
 50 # Byte order [in]dependence. =========================================
 51 #
 52 # Originally caller was expected to maintain specific *dword* order in
 53 # h[0-7], namely with most significant dword at *lower* address, which
 54 # was reflected in below two parameters as 0 and 4. Now caller is
 55 # expected to maintain native byte order for whole 64-bit values.
 56 $hi="HI";       # emitted literally as "HI"; the generated file #defines HI as 0 or 4 depending on __ARMEL__
 57 $lo="LO";       # emitted literally as "LO"; used as the offset of the least significant 32-bit half
 58 # ====================================================================
 59 
 60 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}     # consume arguments until one looks like a file name (word chars/dashes, a dot, an extension) or none are left
 61 open STDOUT,">$output";         # redirect STDOUT to that file; the result of open is not checked. NOTE(review): with no matching argument this open presumably fails and output stays on the original STDOUT - confirm before adding error handling
 62 
 63 $ctx="r0";      # parameter block: the eight 64-bit state words a..h are loaded from and stored back to it
 64 $inp="r1";      # input pointer, advanced by 8 for every message word loaded
 65 $len="r2";      # block count on entry; turned into an end-of-input pointer ($inp + $len*128)
 66 
 67 $Tlo="r3";      # T, the 64-bit round accumulator, kept as two 32-bit halves
 68 $Thi="r4";
 69 $Alo="r5";      # working variable a (lo/hi halves)
 70 $Ahi="r6";
 71 $Elo="r7";      # working variable e (lo/hi halves)
 72 $Ehi="r8";
 73 $t0="r9";       # $t0..$t3 are scratch registers
 74 $t1="r10";
 75 $t2="r11";
 76 $t3="r12";
 77 ############    r13 is stack pointer
 78 $Ktbl="r14";    # pointer into the K512 table; bit 0 is also used as the "last round reached" flag
 79 ############    r15 is program counter
 80 
 81 $Aoff=8*0;      # byte offsets of the 64-bit words a..h, used for both the context and the stack frame
 82 $Boff=8*1;
 83 $Coff=8*2;
 84 $Doff=8*3;
 85 $Eoff=8*4;
 86 $Foff=8*5;
 87 $Goff=8*6;
 88 $Hoff=8*7;
 89 $Xoff=8*8;      # stack-frame offset where the current message word X[i] is stored
 90 
 91 sub BODY_00_15() {      # append one integer-only SHA-512 round to $code; callers use &BODY_00_15(...) so the empty prototype is bypassed
 92 my $magic = shift;      # byte compared with the low byte of K[i].lo; on a match the emitted code sets bit 0 of $Ktbl to signal the last round of the loop
 93 $code.=<<___;           # assembly template up to the ___ terminator; Perl variables are interpolated. Each emitted round moves sp down by 8 and $Ktbl up by 8
 94         @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
 95         @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
 96         @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
 97         mov     $t0,$Elo,lsr#14
 98         str     $Tlo,[sp,#$Xoff+0]
 99         mov     $t1,$Ehi,lsr#14
100         str     $Thi,[sp,#$Xoff+4]
101         eor     $t0,$t0,$Ehi,lsl#18
102         ldr     $t2,[sp,#$Hoff+0]       @ h.lo
103         eor     $t1,$t1,$Elo,lsl#18
104         ldr     $t3,[sp,#$Hoff+4]       @ h.hi
105         eor     $t0,$t0,$Elo,lsr#18
106         eor     $t1,$t1,$Ehi,lsr#18
107         eor     $t0,$t0,$Ehi,lsl#14
108         eor     $t1,$t1,$Elo,lsl#14
109         eor     $t0,$t0,$Ehi,lsr#9
110         eor     $t1,$t1,$Elo,lsr#9
111         eor     $t0,$t0,$Elo,lsl#23
112         eor     $t1,$t1,$Ehi,lsl#23     @ Sigma1(e)
113         adds    $Tlo,$Tlo,$t0
114         ldr     $t0,[sp,#$Foff+0]       @ f.lo
115         adc     $Thi,$Thi,$t1           @ T += Sigma1(e)
116         ldr     $t1,[sp,#$Foff+4]       @ f.hi
117         adds    $Tlo,$Tlo,$t2
118         ldr     $t2,[sp,#$Goff+0]       @ g.lo
119         adc     $Thi,$Thi,$t3           @ T += h
120         ldr     $t3,[sp,#$Goff+4]       @ g.hi
121 
122         eor     $t0,$t0,$t2
123         str     $Elo,[sp,#$Eoff+0]
124         eor     $t1,$t1,$t3
125         str     $Ehi,[sp,#$Eoff+4]
126         and     $t0,$t0,$Elo
127         str     $Alo,[sp,#$Aoff+0]
128         and     $t1,$t1,$Ehi
129         str     $Ahi,[sp,#$Aoff+4]
130         eor     $t0,$t0,$t2
131         ldr     $t2,[$Ktbl,#$lo]        @ K[i].lo
132         eor     $t1,$t1,$t3             @ Ch(e,f,g)
133         ldr     $t3,[$Ktbl,#$hi]        @ K[i].hi
134 
135         adds    $Tlo,$Tlo,$t0
136         ldr     $Elo,[sp,#$Doff+0]      @ d.lo
137         adc     $Thi,$Thi,$t1           @ T += Ch(e,f,g)
138         ldr     $Ehi,[sp,#$Doff+4]      @ d.hi
139         adds    $Tlo,$Tlo,$t2
140         and     $t0,$t2,#0xff
141         adc     $Thi,$Thi,$t3           @ T += K[i]
142         adds    $Elo,$Elo,$Tlo
143         ldr     $t2,[sp,#$Boff+0]       @ b.lo
144         adc     $Ehi,$Ehi,$Thi          @ d += T
145         teq     $t0,#$magic
146 
147         ldr     $t3,[sp,#$Coff+0]       @ c.lo
148 #if __ARM_ARCH__>=7
149         it      eq                      @ Thumb2 thing, sanity check in ARM
150 #endif
151         orreq   $Ktbl,$Ktbl,#1
152         @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
153         @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
154         @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
155         mov     $t0,$Alo,lsr#28
156         mov     $t1,$Ahi,lsr#28
157         eor     $t0,$t0,$Ahi,lsl#4
158         eor     $t1,$t1,$Alo,lsl#4
159         eor     $t0,$t0,$Ahi,lsr#2
160         eor     $t1,$t1,$Alo,lsr#2
161         eor     $t0,$t0,$Alo,lsl#30
162         eor     $t1,$t1,$Ahi,lsl#30
163         eor     $t0,$t0,$Ahi,lsr#7
164         eor     $t1,$t1,$Alo,lsr#7
165         eor     $t0,$t0,$Alo,lsl#25
166         eor     $t1,$t1,$Ahi,lsl#25     @ Sigma0(a)
167         adds    $Tlo,$Tlo,$t0
168         and     $t0,$Alo,$t2
169         adc     $Thi,$Thi,$t1           @ T += Sigma0(a)
170 
171         ldr     $t1,[sp,#$Boff+4]       @ b.hi
172         orr     $Alo,$Alo,$t2
173         ldr     $t2,[sp,#$Coff+4]       @ c.hi
174         and     $Alo,$Alo,$t3
175         and     $t3,$Ahi,$t1
176         orr     $Ahi,$Ahi,$t1
177         orr     $Alo,$Alo,$t0           @ Maj(a,b,c).lo
178         and     $Ahi,$Ahi,$t2
179         adds    $Alo,$Alo,$Tlo
180         orr     $Ahi,$Ahi,$t3           @ Maj(a,b,c).hi
181         sub     sp,sp,#8
182         adc     $Ahi,$Ahi,$Thi          @ h += T
183         tst     $Ktbl,#1
184         add     $Ktbl,$Ktbl,#8
185 ___
186 }
187 $code=<<___;
188 #ifndef __KERNEL__
189 # include "arm_arch.h"
190 # define VFP_ABI_PUSH   vstmdb  sp!,{d8-d15}
191 # define VFP_ABI_POP    vldmia  sp!,{d8-d15}
192 #else
193 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
194 # define __ARM_MAX_ARCH__ 7
195 # define VFP_ABI_PUSH
196 # define VFP_ABI_POP
197 #endif
198 
199 #ifdef __ARMEL__
200 # define LO 0
201 # define HI 4
202 # define WORD64(hi0,lo0,hi1,lo1)        .word   lo0,hi0, lo1,hi1
203 #else
204 # define HI 0
205 # define LO 4
206 # define WORD64(hi0,lo0,hi1,lo1)        .word   hi0,lo0, hi1,lo1
207 #endif
208 
209 .text
210 #if __ARM_ARCH__<7
211 .code   32
212 #else
213 .syntax unified
214 # ifdef __thumb2__
215 .thumb
216 # else
217 .code   32
218 # endif
219 #endif
220 
221 .type   K512,%object
222 .align  5
223 K512:
224 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
225 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
226 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
227 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
228 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
229 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
230 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
231 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
232 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
233 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
234 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
235 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
236 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
237 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
238 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
239 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
240 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
241 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
242 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
243 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
244 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
245 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
246 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
247 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
248 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
249 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
250 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
251 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
252 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
253 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
254 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
255 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
256 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
257 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
258 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
259 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
260 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
261 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
262 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
263 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
264 .size   K512,.-K512
265 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
266 .LOPENSSL_armcap:
267 .word   OPENSSL_armcap_P-sha512_block_data_order
268 .skip   32-4
269 #else
270 .skip   32
271 #endif
272 
273 .global sha512_block_data_order
274 .type   sha512_block_data_order,%function
275 sha512_block_data_order:
276 .Lsha512_block_data_order:
277 #if __ARM_ARCH__<7
278         sub     r3,pc,#8                @ sha512_block_data_order
279 #else
280         adr     r3,.Lsha512_block_data_order
281 #endif
282 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
283         ldr     r12,.LOPENSSL_armcap
284         ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
285         tst     r12,#1
286         bne     .LNEON
287 #endif
288         add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
289         stmdb   sp!,{r4-r12,lr}
290         sub     $Ktbl,r3,#672           @ K512
291         sub     sp,sp,#9*8
292 
293         ldr     $Elo,[$ctx,#$Eoff+$lo]
294         ldr     $Ehi,[$ctx,#$Eoff+$hi]
295         ldr     $t0, [$ctx,#$Goff+$lo]
296         ldr     $t1, [$ctx,#$Goff+$hi]
297         ldr     $t2, [$ctx,#$Hoff+$lo]
298         ldr     $t3, [$ctx,#$Hoff+$hi]
299 .Loop:
300         str     $t0, [sp,#$Goff+0]
301         str     $t1, [sp,#$Goff+4]
302         str     $t2, [sp,#$Hoff+0]
303         str     $t3, [sp,#$Hoff+4]
304         ldr     $Alo,[$ctx,#$Aoff+$lo]
305         ldr     $Ahi,[$ctx,#$Aoff+$hi]
306         ldr     $Tlo,[$ctx,#$Boff+$lo]
307         ldr     $Thi,[$ctx,#$Boff+$hi]
308         ldr     $t0, [$ctx,#$Coff+$lo]
309         ldr     $t1, [$ctx,#$Coff+$hi]
310         ldr     $t2, [$ctx,#$Doff+$lo]
311         ldr     $t3, [$ctx,#$Doff+$hi]
312         str     $Tlo,[sp,#$Boff+0]
313         str     $Thi,[sp,#$Boff+4]
314         str     $t0, [sp,#$Coff+0]
315         str     $t1, [sp,#$Coff+4]
316         str     $t2, [sp,#$Doff+0]
317         str     $t3, [sp,#$Doff+4]
318         ldr     $Tlo,[$ctx,#$Foff+$lo]
319         ldr     $Thi,[$ctx,#$Foff+$hi]
320         str     $Tlo,[sp,#$Foff+0]
321         str     $Thi,[sp,#$Foff+4]
322 
323 .L00_15:
324 #if __ARM_ARCH__<7
325         ldrb    $Tlo,[$inp,#7]
326         ldrb    $t0, [$inp,#6]
327         ldrb    $t1, [$inp,#5]
328         ldrb    $t2, [$inp,#4]
329         ldrb    $Thi,[$inp,#3]
330         ldrb    $t3, [$inp,#2]
331         orr     $Tlo,$Tlo,$t0,lsl#8
332         ldrb    $t0, [$inp,#1]
333         orr     $Tlo,$Tlo,$t1,lsl#16
334         ldrb    $t1, [$inp],#8
335         orr     $Tlo,$Tlo,$t2,lsl#24
336         orr     $Thi,$Thi,$t3,lsl#8
337         orr     $Thi,$Thi,$t0,lsl#16
338         orr     $Thi,$Thi,$t1,lsl#24
339 #else
340         ldr     $Tlo,[$inp,#4]
341         ldr     $Thi,[$inp],#8
342 #ifdef __ARMEL__
343         rev     $Tlo,$Tlo
344         rev     $Thi,$Thi
345 #endif
346 #endif
347 ___
348         &BODY_00_15(0x94);      # round body for the .L00_15 loop; 0x94 is the low byte of K[15].lo (0xcf692694), so the flag is set in round 15
349 $code.=<<___;
350         tst     $Ktbl,#1
351         beq     .L00_15
352         ldr     $t0,[sp,#`$Xoff+8*(16-1)`+0]
353         ldr     $t1,[sp,#`$Xoff+8*(16-1)`+4]
354         bic     $Ktbl,$Ktbl,#1
355 .L16_79:
356         @ sigma0(x)     (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
357         @ LO            lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
358         @ HI            hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
359         mov     $Tlo,$t0,lsr#1
360         ldr     $t2,[sp,#`$Xoff+8*(16-14)`+0]
361         mov     $Thi,$t1,lsr#1
362         ldr     $t3,[sp,#`$Xoff+8*(16-14)`+4]
363         eor     $Tlo,$Tlo,$t1,lsl#31
364         eor     $Thi,$Thi,$t0,lsl#31
365         eor     $Tlo,$Tlo,$t0,lsr#8
366         eor     $Thi,$Thi,$t1,lsr#8
367         eor     $Tlo,$Tlo,$t1,lsl#24
368         eor     $Thi,$Thi,$t0,lsl#24
369         eor     $Tlo,$Tlo,$t0,lsr#7
370         eor     $Thi,$Thi,$t1,lsr#7
371         eor     $Tlo,$Tlo,$t1,lsl#25
372 
373         @ sigma1(x)     (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
374         @ LO            lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
375         @ HI            hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
376         mov     $t0,$t2,lsr#19
377         mov     $t1,$t3,lsr#19
378         eor     $t0,$t0,$t3,lsl#13
379         eor     $t1,$t1,$t2,lsl#13
380         eor     $t0,$t0,$t3,lsr#29
381         eor     $t1,$t1,$t2,lsr#29
382         eor     $t0,$t0,$t2,lsl#3
383         eor     $t1,$t1,$t3,lsl#3
384         eor     $t0,$t0,$t2,lsr#6
385         eor     $t1,$t1,$t3,lsr#6
386         ldr     $t2,[sp,#`$Xoff+8*(16-9)`+0]
387         eor     $t0,$t0,$t3,lsl#26
388 
389         ldr     $t3,[sp,#`$Xoff+8*(16-9)`+4]
390         adds    $Tlo,$Tlo,$t0
391         ldr     $t0,[sp,#`$Xoff+8*16`+0]
392         adc     $Thi,$Thi,$t1
393 
394         ldr     $t1,[sp,#`$Xoff+8*16`+4]
395         adds    $Tlo,$Tlo,$t2
396         adc     $Thi,$Thi,$t3
397         adds    $Tlo,$Tlo,$t0
398         adc     $Thi,$Thi,$t1
399 ___
400         &BODY_00_15(0x17);      # round body for the .L16_79 loop; 0x17 is the low byte of K[79].lo (0x4a475817), so the flag is set in round 79
401 $code.=<<___;
402 #if __ARM_ARCH__>=7
403         ittt    eq                      @ Thumb2 thing, sanity check in ARM
404 #endif
405         ldreq   $t0,[sp,#`$Xoff+8*(16-1)`+0]
406         ldreq   $t1,[sp,#`$Xoff+8*(16-1)`+4]
407         beq     .L16_79
408         bic     $Ktbl,$Ktbl,#1
409 
410         ldr     $Tlo,[sp,#$Boff+0]
411         ldr     $Thi,[sp,#$Boff+4]
412         ldr     $t0, [$ctx,#$Aoff+$lo]
413         ldr     $t1, [$ctx,#$Aoff+$hi]
414         ldr     $t2, [$ctx,#$Boff+$lo]
415         ldr     $t3, [$ctx,#$Boff+$hi]
416         adds    $t0,$Alo,$t0
417         str     $t0, [$ctx,#$Aoff+$lo]
418         adc     $t1,$Ahi,$t1
419         str     $t1, [$ctx,#$Aoff+$hi]
420         adds    $t2,$Tlo,$t2
421         str     $t2, [$ctx,#$Boff+$lo]
422         adc     $t3,$Thi,$t3
423         str     $t3, [$ctx,#$Boff+$hi]
424 
425         ldr     $Alo,[sp,#$Coff+0]
426         ldr     $Ahi,[sp,#$Coff+4]
427         ldr     $Tlo,[sp,#$Doff+0]
428         ldr     $Thi,[sp,#$Doff+4]
429         ldr     $t0, [$ctx,#$Coff+$lo]
430         ldr     $t1, [$ctx,#$Coff+$hi]
431         ldr     $t2, [$ctx,#$Doff+$lo]
432         ldr     $t3, [$ctx,#$Doff+$hi]
433         adds    $t0,$Alo,$t0
434         str     $t0, [$ctx,#$Coff+$lo]
435         adc     $t1,$Ahi,$t1
436         str     $t1, [$ctx,#$Coff+$hi]
437         adds    $t2,$Tlo,$t2
438         str     $t2, [$ctx,#$Doff+$lo]
439         adc     $t3,$Thi,$t3
440         str     $t3, [$ctx,#$Doff+$hi]
441 
442         ldr     $Tlo,[sp,#$Foff+0]
443         ldr     $Thi,[sp,#$Foff+4]
444         ldr     $t0, [$ctx,#$Eoff+$lo]
445         ldr     $t1, [$ctx,#$Eoff+$hi]
446         ldr     $t2, [$ctx,#$Foff+$lo]
447         ldr     $t3, [$ctx,#$Foff+$hi]
448         adds    $Elo,$Elo,$t0
449         str     $Elo,[$ctx,#$Eoff+$lo]
450         adc     $Ehi,$Ehi,$t1
451         str     $Ehi,[$ctx,#$Eoff+$hi]
452         adds    $t2,$Tlo,$t2
453         str     $t2, [$ctx,#$Foff+$lo]
454         adc     $t3,$Thi,$t3
455         str     $t3, [$ctx,#$Foff+$hi]
456 
457         ldr     $Alo,[sp,#$Goff+0]
458         ldr     $Ahi,[sp,#$Goff+4]
459         ldr     $Tlo,[sp,#$Hoff+0]
460         ldr     $Thi,[sp,#$Hoff+4]
461         ldr     $t0, [$ctx,#$Goff+$lo]
462         ldr     $t1, [$ctx,#$Goff+$hi]
463         ldr     $t2, [$ctx,#$Hoff+$lo]
464         ldr     $t3, [$ctx,#$Hoff+$hi]
465         adds    $t0,$Alo,$t0
466         str     $t0, [$ctx,#$Goff+$lo]
467         adc     $t1,$Ahi,$t1
468         str     $t1, [$ctx,#$Goff+$hi]
469         adds    $t2,$Tlo,$t2
470         str     $t2, [$ctx,#$Hoff+$lo]
471         adc     $t3,$Thi,$t3
472         str     $t3, [$ctx,#$Hoff+$hi]
473 
474         add     sp,sp,#640
475         sub     $Ktbl,$Ktbl,#640
476 
477         teq     $inp,$len
478         bne     .Loop
479 
480         add     sp,sp,#8*9              @ destroy frame
481 #if __ARM_ARCH__>=5
482         ldmia   sp!,{r4-r12,pc}
483 #else
484         ldmia   sp!,{r4-r12,lr}
485         tst     lr,#1
486         moveq   pc,lr                   @ be binary compatible with V4, yet
487         bx      lr                      @ interoperable with Thumb ISA:-)
488 #endif
489 .size   sha512_block_data_order,.-sha512_block_data_order
490 ___
491 
492 {
493 my @Sigma0=(28,34,39);  # right-rotation amounts used for Sigma0(a)
494 my @Sigma1=(14,18,41);  # right-rotation amounts used for Sigma1(e)
495 my @sigma0=(1, 8, 7);   # first two are rotations; the third (7) is emitted as a plain right shift
496 my @sigma1=(19,61,6);   # first two are rotations; the third (6) is emitted as a plain right shift
497 
498 my $Ktbl="r3";          # K512 pointer for the NEON path (shadows the integer path's r14)
499 my $cnt="r12";  # volatile register known as ip, intra-procedure-call scratch
500 
501 my @X=map("d$_",(0..15));       # d0..d15 hold the sixteen 64-bit message-schedule words
502 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));    # d16..d23 hold a..h; @V is rotated after every generated round
503 
504 sub NEON_00_15() {      # append one NEON SHA-512 round to $code; callers use &NEON_00_15(...) so the empty prototype is bypassed
505 my $i=shift;            # round number; selects X[$i%16] and, when below 16, the message-load instructions
506 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;        # d-register names currently playing the roles a..h
507 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));   # temps d24..d30; d31 is generated by the map but not assigned
508 
509 $code.=<<___ if ($i<16 || $i&1);        # skipped for even rounds >=16, where NEON_16_79 has already emitted these vshr/vadd instructions
510         vshr.u64        $t0,$e,#@Sigma1[0]      @ $i
511 #if $i<16
512         vld1.64         {@X[$i%16]},[$inp]!     @ handles unaligned
513 #endif
514         vshr.u64        $t1,$e,#@Sigma1[1]
515 #if $i>0
516          vadd.i64       $a,$Maj                 @ h+=Maj from the past
517 #endif
518         vshr.u64        $t2,$e,#@Sigma1[2]
519 ___
520 $code.=<<___;           # rest of the round; the final h+=Maj is left commented out and is emitted at the start of the next round instead
521         vld1.64         {$K},[$Ktbl,:64]!       @ K[i++]
522         vsli.64         $t0,$e,#`64-@Sigma1[0]`
523         vsli.64         $t1,$e,#`64-@Sigma1[1]`
524         vmov            $Ch,$e
525         vsli.64         $t2,$e,#`64-@Sigma1[2]`
526 #if $i<16 && defined(__ARMEL__)
527         vrev64.8        @X[$i],@X[$i]
528 #endif
529         veor            $t1,$t0
530         vbsl            $Ch,$f,$g               @ Ch(e,f,g)
531         vshr.u64        $t0,$a,#@Sigma0[0]
532         veor            $t2,$t1                 @ Sigma1(e)
533         vadd.i64        $T1,$Ch,$h
534         vshr.u64        $t1,$a,#@Sigma0[1]
535         vsli.64         $t0,$a,#`64-@Sigma0[0]`
536         vadd.i64        $T1,$t2
537         vshr.u64        $t2,$a,#@Sigma0[2]
538         vadd.i64        $K,@X[$i%16]
539         vsli.64         $t1,$a,#`64-@Sigma0[1]`
540         veor            $Maj,$a,$b
541         vsli.64         $t2,$a,#`64-@Sigma0[2]`
542         veor            $h,$t0,$t1
543         vadd.i64        $T1,$K
544         vbsl            $Maj,$c,$b              @ Maj(a,b,c)
545         veor            $h,$t2                  @ Sigma0(a)
546         vadd.i64        $d,$T1
547         vadd.i64        $Maj,$T1
548         @ vadd.i64      $h,$Maj
549 ___
550 }
551 
552 sub NEON_16_79() {      # append NEON round $i (16 and up) to $code; even rounds also update two message-schedule words at once
553 my $i=shift;            # round number; the remaining arguments are the a..h register names, passed through to NEON_00_15
554 
555 if ($i&1)       { &NEON_00_15($i,@_); return; } # odd rounds: the schedule words were already updated by the preceding even round
556 
557 # 2x-vectorized, therefore runs every 2nd round
558 my @X=map("q$_",(0..7));                        # view @X as 128-bit vector
559 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));     # temps
560 my ($d0,$d1,$d2) = map("d$_",(24..26));         # temps from NEON_00_15
561 my $e=@_[4];                                    # $e from NEON_00_15
562 $i /= 2;                # index of the q register holding this pair of schedule words; doubled again for the NEON_00_15 call below
563 $code.=<<___;           # schedule update interleaved with the three Sigma1 vshr instructions that NEON_00_15 omits for even rounds >=16
564         vshr.u64        $t0,@X[($i+7)%8],#@sigma1[0]
565         vshr.u64        $t1,@X[($i+7)%8],#@sigma1[1]
566          vadd.i64       @_[0],d30                       @ h+=Maj from the past
567         vshr.u64        $s1,@X[($i+7)%8],#@sigma1[2]
568         vsli.64         $t0,@X[($i+7)%8],#`64-@sigma1[0]`
569         vext.8          $s0,@X[$i%8],@X[($i+1)%8],#8    @ X[i+1]
570         vsli.64         $t1,@X[($i+7)%8],#`64-@sigma1[1]`
571         veor            $s1,$t0
572         vshr.u64        $t0,$s0,#@sigma0[0]
573         veor            $s1,$t1                         @ sigma1(X[i+14])
574         vshr.u64        $t1,$s0,#@sigma0[1]
575         vadd.i64        @X[$i%8],$s1
576         vshr.u64        $s1,$s0,#@sigma0[2]
577         vsli.64         $t0,$s0,#`64-@sigma0[0]`
578         vsli.64         $t1,$s0,#`64-@sigma0[1]`
579         vext.8          $s0,@X[($i+4)%8],@X[($i+5)%8],#8        @ X[i+9]
580         veor            $s1,$t0
581         vshr.u64        $d0,$e,#@Sigma1[0]              @ from NEON_00_15
582         vadd.i64        @X[$i%8],$s0
583         vshr.u64        $d1,$e,#@Sigma1[1]              @ from NEON_00_15
584         veor            $s1,$t1                         @ sigma0(X[i+1])
585         vshr.u64        $d2,$e,#@Sigma1[2]              @ from NEON_00_15
586         vadd.i64        @X[$i%8],$s1
587 ___
588         &NEON_00_15(2*$i,@_);   # emit the remainder of the round under its original (even) round number
589 }
590 
591 $code.=<<___;           # NEON entry point: end-of-input pointer, K512 address, context load into $A..$H
592 #if __ARM_MAX_ARCH__>=7
593 .arch   armv7-a
594 .fpu    neon
595 
596 .global sha512_block_data_order_neon
597 .type   sha512_block_data_order_neon,%function
598 .align  4
599 sha512_block_data_order_neon:
600 .LNEON:
601         dmb                             @ errata #451034 on early Cortex A8
602         add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
603         VFP_ABI_PUSH
604         adr     $Ktbl,.Lsha512_block_data_order
605         sub     $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
606         vldmia  $ctx,{$A-$H}            @ load context
607 .Loop_neon:
608 ___
609 for($i=0;$i<16;$i++)    { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }    # rounds 0..15 unrolled; after each round the last register of @V moves to the front (old h becomes the next a)
610 $code.=<<___;           # set up the counter for four passes over the 16-round body generated below
611         mov             $cnt,#4
612 .L16_79_neon:
613         subs            $cnt,#1
614 ___
615 for(;$i<32;$i++)        { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }    # $i continues from 16; these 16 rounds are generated once and the emitted loop runs them 4 times
616 $code.=<<___;           # loop back, add the saved context to the working variables, store, and repeat per input block
617         bne             .L16_79_neon
618 
619          vadd.i64       $A,d30          @ h+=Maj from the past
620         vldmia          $ctx,{d24-d31}  @ load context to temp
621         vadd.i64        q8,q12          @ vectorized accumulate
622         vadd.i64        q9,q13
623         vadd.i64        q10,q14
624         vadd.i64        q11,q15
625         vstmia          $ctx,{$A-$H}    @ save context
626         teq             $inp,$len
627         sub             $Ktbl,#640      @ rewind K512
628         bne             .Loop_neon
629 
630         VFP_ABI_POP
631         ret                             @ bx lr
632 .size   sha512_block_data_order_neon,.-sha512_block_data_order_neon
633 #endif
634 ___
635 }
636 $code.=<<___;           # identification string and, outside the kernel, the OPENSSL_armcap_P common symbol; \@ keeps Perl from interpolating an array
637 .asciz  "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
638 .align  2
639 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
640 .comm   OPENSSL_armcap_P,4,4
641 #endif
642 ___
643 
644 $code =~ s/\`([^\`]*)\`/eval $1/gem;            # replace every backtick-quoted expression in the template with the result of evaluating it as Perl
645 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
646 $code =~ s/\bret\b/bx   lr/gm;                  # runs after the line above, so each "ret" becomes a real "bx lr" instruction rather than a .word
647 
648 open SELF,$0;           # reopen this script to copy its leading comment block into the output; the result of open is not checked
649 while(<SELF>) {
650         next if (/^#!/);                        # drop the interpreter line
651         last if (!s/^#/@/ and !/^$/);           # turn a leading '#' into '@'; stop at the first line that is neither a comment nor empty
652         print;
653 }
654 close SELF;
655 
656 print $code;    # write the generated assembly after the copied header comments
657 close STDOUT; # enforce flush

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php