Linux/arch/arm/crypto/chacha-scalar-core.S

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used.  So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions.  This is faster than using explicit rotate
 * instructions.  To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount.  The rotation amount is then fixed up just in time
 * when the values are used.  'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */
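
/*
 * For example, once brot == 20 the registers holding row 'b' contain each
 * value rotated left by 20 bits (equivalently, right by 12), so an
 * instruction such as
 *
 *      add     \a1, \a1, \b1, ror #20
 *
 * recovers the true value of b on the fly and computes a += b, with no
 * explicit rotate instruction ever issued.
 */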

        // ChaCha state registers
        X0      .req    r0
        X1      .req    r1
        X2      .req    r2
        X3      .req    r3
        X4      .req    r4
        X5      .req    r5
        X6      .req    r6
        X7      .req    r7
        X8_X10  .req    r8      // shared by x8 and x10
        X9_X11  .req    r9      // shared by x9 and x11
        X12     .req    r10
        X13     .req    r11
        X14     .req    r12
        X15     .req    r14

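// Byte-swap four 32-bit words on big-endian ARM (__ARMEB__) so the
// keystream is always produced in ChaCha's little-endian byte order;
// expands to nothing on little-endian builds.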
.macro _le32_bswap_4x   a, b, c, d,  tmp
#ifdef __ARMEB__
        rev_l           \a,  \tmp
        rev_l           \b,  \tmp
        rev_l           \c,  \tmp
        rev_l           \d,  \tmp
#endif
.endm

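// Load/store a pair of registers from/to consecutive words: a single
// ldrd/strd on ARMv6+, two plain ldr/str instructions otherwise.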
.macro __ldrd           a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
        ldrd            \a, \b, [\src, #\offset]
#else
        ldr             \a, [\src, #\offset]
        ldr             \b, [\src, #\offset + 4]
#endif
.endm

.macro __strd           a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
        strd            \a, \b, [\dst, #\offset]
#else
        str             \a, [\dst, #\offset]
        str             \b, [\dst, #\offset + 4]
#endif
.endm

.macro _halfround       a1, b1, c1, d1,  a2, b2, c2, d2

        // a += b; d ^= a; d = rol(d, 16);
        add             \a1, \a1, \b1, ror #brot
        add             \a2, \a2, \b2, ror #brot
        eor             \d1, \a1, \d1, ror #drot
        eor             \d2, \a2, \d2, ror #drot
        // drot == 32 - 16 == 16

        // c += d; b ^= c; b = rol(b, 12);
        add             \c1, \c1, \d1, ror #16
        add             \c2, \c2, \d2, ror #16
        eor             \b1, \c1, \b1, ror #brot
        eor             \b2, \c2, \b2, ror #brot
        // brot == 32 - 12 == 20

        // a += b; d ^= a; d = rol(d, 8);
        add             \a1, \a1, \b1, ror #20
        add             \a2, \a2, \b2, ror #20
        eor             \d1, \a1, \d1, ror #16
        eor             \d2, \a2, \d2, ror #16
        // drot == 32 - 8 == 24

        // c += d; b ^= c; b = rol(b, 7);
        add             \c1, \c1, \d1, ror #24
        add             \c2, \c2, \d2, ror #24
        eor             \b1, \c1, \b1, ror #20
        eor             \b2, \c2, \b2, ror #20
        // brot == 32 - 7 == 25
.endm
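
/*
 * For reference, each _halfround above interleaves two of the standard
 * ChaCha quarter rounds, which in plain C (on fully-rotated values) are:
 *
 *      #define ROL32(v, n)     (((v) << (n)) | ((v) >> (32 - (n))))
 *
 *      a += b; d ^= a; d = ROL32(d, 16);
 *      c += d; b ^= c; b = ROL32(b, 12);
 *      a += b; d ^= a; d = ROL32(d,  8);
 *      c += d; b ^= c; b = ROL32(b,  7);
 */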

.macro _doubleround

        // column round

        // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
        _halfround      X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

        // save (x8, x9); restore (x10, x11)
        __strd          X8_X10, X9_X11, sp, 0
        __ldrd          X8_X10, X9_X11, sp, 8

        // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
        _halfround      X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

        .set brot, 25
        .set drot, 24

        // diagonal round

        // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
        _halfround      X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

        // save (x10, x11); restore (x8, x9)
        __strd          X8_X10, X9_X11, sp, 8
        __ldrd          X8_X10, X9_X11, sp, 0

        // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
        _halfround      X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

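// Apply \nrounds / 2 double rounds (e.g. 10 for ChaCha20), starting from
// unrotated rows; as noted above, rows 'b' and 'd' are left needing
// rotations of brot == 25 and drot == 24 afterwards.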
.macro _chacha_permute  nrounds
        .set brot, 0
        .set drot, 0
        .rept \nrounds / 2
         _doubleround
        .endr
.endm

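// Per-block driver: permute the state, add back the original state words
// (the ChaCha feed-forward), XOR 64 bytes of input with the keystream via
// the aligned fast path or the stack-buffered slow path, then loop until
// LEN is exhausted.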
.macro _chacha          nrounds

.Lnext_block\@:
        // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
        // Registers contain x0-x9,x12-x15.

        // Do the core ChaCha permutation to update x0-x15.
        _chacha_permute \nrounds

        add             sp, #8
        // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
        // Registers contain x0-x9,x12-x15.
        // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

        // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
        push            {X8_X10, X9_X11, X12, X13, X14, X15}

        // Load (OUT, IN, LEN).
        ldr             r14, [sp, #96]
        ldr             r12, [sp, #100]
        ldr             r11, [sp, #104]

        orr             r10, r14, r12

        // Use slow path if fewer than 64 bytes remain.
        cmp             r11, #64
        blt             .Lxor_slowpath\@

        // Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
        // ARMv6+, since ldmia and stmia (used below) still require alignment.
        tst             r10, #3
        bne             .Lxor_slowpath\@

        // Fast path: XOR 64 bytes of aligned data.

        // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
        // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
        // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

        // x0-x3
        __ldrd          r8, r9, sp, 32
        __ldrd          r10, r11, sp, 40
        add             X0, X0, r8
        add             X1, X1, r9
        add             X2, X2, r10
        add             X3, X3, r11
        _le32_bswap_4x  X0, X1, X2, X3,  r8
        ldmia           r12!, {r8-r11}
        eor             X0, X0, r8
        eor             X1, X1, r9
        eor             X2, X2, r10
        eor             X3, X3, r11
        stmia           r14!, {X0-X3}

        // x4-x7
        __ldrd          r8, r9, sp, 48
        __ldrd          r10, r11, sp, 56
        add             X4, r8, X4, ror #brot
        add             X5, r9, X5, ror #brot
        ldmia           r12!, {X0-X3}
        add             X6, r10, X6, ror #brot
        add             X7, r11, X7, ror #brot
        _le32_bswap_4x  X4, X5, X6, X7,  r8
        eor             X4, X4, X0
        eor             X5, X5, X1
        eor             X6, X6, X2
        eor             X7, X7, X3
        stmia           r14!, {X4-X7}

        // x8-x15
        pop             {r0-r7}                 // (x8-x9,x12-x15,x10-x11)
        __ldrd          r8, r9, sp, 32
        __ldrd          r10, r11, sp, 40
        add             r0, r0, r8              // x8
        add             r1, r1, r9              // x9
        add             r6, r6, r10             // x10
        add             r7, r7, r11             // x11
        _le32_bswap_4x  r0, r1, r6, r7,  r8
        ldmia           r12!, {r8-r11}
        eor             r0, r0, r8              // x8
        eor             r1, r1, r9              // x9
        eor             r6, r6, r10             // x10
        eor             r7, r7, r11             // x11
        stmia           r14!, {r0,r1,r6,r7}
        ldmia           r12!, {r0,r1,r6,r7}
        __ldrd          r8, r9, sp, 48
        __ldrd          r10, r11, sp, 56
        add             r2, r8, r2, ror #drot   // x12
        add             r3, r9, r3, ror #drot   // x13
        add             r4, r10, r4, ror #drot  // x14
        add             r5, r11, r5, ror #drot  // x15
        _le32_bswap_4x  r2, r3, r4, r5,  r9
          ldr           r9, [sp, #72]           // load LEN
        eor             r2, r2, r0              // x12
        eor             r3, r3, r1              // x13
        eor             r4, r4, r6              // x14
        eor             r5, r5, r7              // x15
          subs          r9, #64                 // decrement and check LEN
        stmia           r14!, {r2-r5}

        beq             .Ldone\@

.Lprepare_for_next_block\@:

        // Stack: x0-x15 OUT IN LEN

        // Increment block counter (x12)
        add             r8, #1

        // Store updated (OUT, IN, LEN)
        str             r14, [sp, #64]
        str             r12, [sp, #68]
        str             r9, [sp, #72]

          mov           r14, sp

        // Store updated block counter (x12)
        str             r8, [sp, #48]

          sub           sp, #16

        // Reload state and do next block
        ldmia           r14!, {r0-r11}          // load x0-x11
        __strd          r10, r11, sp, 8         // store x10-x11 before state
        ldmia           r14, {r10-r12,r14}      // load x12-x15
        b               .Lnext_block\@

.Lxor_slowpath\@:
        // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
        // We handle it by storing the 64 bytes of keystream to the stack, then
        // XOR-ing the needed portion with the data.

        // Allocate keystream buffer
        sub             sp, #64
        mov             r14, sp

        // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
        // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
        // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

        // Save keystream for x0-x3
        __ldrd          r8, r9, sp, 96
        __ldrd          r10, r11, sp, 104
        add             X0, X0, r8
        add             X1, X1, r9
        add             X2, X2, r10
        add             X3, X3, r11
        _le32_bswap_4x  X0, X1, X2, X3,  r8
        stmia           r14!, {X0-X3}

        // Save keystream for x4-x7
        __ldrd          r8, r9, sp, 112
        __ldrd          r10, r11, sp, 120
        add             X4, r8, X4, ror #brot
        add             X5, r9, X5, ror #brot
        add             X6, r10, X6, ror #brot
        add             X7, r11, X7, ror #brot
        _le32_bswap_4x  X4, X5, X6, X7,  r8
          add           r8, sp, #64
        stmia           r14!, {X4-X7}

        // Save keystream for x8-x15
        ldm             r8, {r0-r7}             // (x8-x9,x12-x15,x10-x11)
        __ldrd          r8, r9, sp, 128
        __ldrd          r10, r11, sp, 136
        add             r0, r0, r8              // x8
        add             r1, r1, r9              // x9
        add             r6, r6, r10             // x10
        add             r7, r7, r11             // x11
        _le32_bswap_4x  r0, r1, r6, r7,  r8
        stmia           r14!, {r0,r1,r6,r7}
        __ldrd          r8, r9, sp, 144
        __ldrd          r10, r11, sp, 152
        add             r2, r8, r2, ror #drot   // x12
        add             r3, r9, r3, ror #drot   // x13
        add             r4, r10, r4, ror #drot  // x14
        add             r5, r11, r5, ror #drot  // x15
        _le32_bswap_4x  r2, r3, r4, r5,  r9
        stmia           r14, {r2-r5}

        // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
        // Registers: r8 is block counter, r12 is IN.

        ldr             r9, [sp, #168]          // LEN
        ldr             r14, [sp, #160]         // OUT
        cmp             r9, #64
          mov           r0, sp
        movle           r1, r9
        movgt           r1, #64
        // r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
        orr             r2, r12, r14
        tst             r2, #3                  // IN or OUT misaligned?
        bne             .Lxor_next_byte\@
.endif

        // XOR a word at a time
.rept 16
        subs            r1, #4
        blt             .Lxor_words_done\@
        ldr             r2, [r12], #4
        ldr             r3, [r0], #4
        eor             r2, r2, r3
        str             r2, [r14], #4
.endr
        b               .Lxor_slowpath_done\@
.Lxor_words_done\@:
        ands            r1, r1, #3
        beq             .Lxor_slowpath_done\@

        // XOR a byte at a time
.Lxor_next_byte\@:
        ldrb            r2, [r12], #1
        ldrb            r3, [r0], #1
        eor             r2, r2, r3
        strb            r2, [r14], #1
        subs            r1, #1
        bne             .Lxor_next_byte\@

.Lxor_slowpath_done\@:
        subs            r9, #64
        add             sp, #96
        bgt             .Lprepare_for_next_block\@

.Ldone\@:
.endm   // _chacha

/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *                   const u32 *state, int nrounds);
 */
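
/*
 * 'state' is the 16-word ChaCha state: per the usual layout, constants in
 * state[0..3], key in state[4..11], block counter in state[12] (x12
 * below), nonce in state[13..15].  'nrounds' selects ChaCha12 when it
 * equals 12, ChaCha20 otherwise.  An illustrative call, assuming an
 * already-initialized state:
 *
 *      chacha_doarm(dst, src, bytes, state, 20);
 */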
ENTRY(chacha_doarm)
        cmp             r2, #0                  // len == 0?
        reteq           lr

        ldr             ip, [sp]
        cmp             ip, #12

        push            {r0-r2,r4-r11,lr}

        // Push state x0-x15 onto stack.
        // Also store an extra copy of x10-x11 just before the state.

        add             X12, r3, #48
        ldm             X12, {X12,X13,X14,X15}
        push            {X12,X13,X14,X15}
        sub             sp, sp, #64

        __ldrd          X8_X10, X9_X11, r3, 40
        __strd          X8_X10, X9_X11, sp, 8
        __strd          X8_X10, X9_X11, sp, 56
        ldm             r3, {X0-X9_X11}
        __strd          X0, X1, sp, 16
        __strd          X2, X3, sp, 24
        __strd          X4, X5, sp, 32
        __strd          X6, X7, sp, 40
        __strd          X8_X10, X9_X11, sp, 48

        beq             1f
        _chacha         20

0:      add             sp, #76
        pop             {r4-r11, pc}

1:      _chacha         12
        b               0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 */
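
/*
 * HChaCha: run the bare ChaCha permutation over 'state' (no feed-forward
 * addition) and write words 0-3 and 12-15 of the result to 'out', as used
 * for XChaCha subkey derivation.
 */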
ENTRY(hchacha_block_arm)
        push            {r1,r4-r11,lr}

        cmp             r2, #12                 // ChaCha12 ?

        mov             r14, r0
        ldmia           r14!, {r0-r11}          // load x0-x11
        push            {r10-r11}               // store x10-x11 to stack
        ldm             r14, {r10-r12,r14}      // load x12-x15
        sub             sp, #8

        beq             1f
        _chacha_permute 20

        // Skip over (unused0-unused1, x10-x11)
0:      add             sp, #16

        // Fix up rotations of x12-x15
        ror             X12, X12, #drot
        ror             X13, X13, #drot
          pop           {r4}                    // load 'out'
        ror             X14, X14, #drot
        ror             X15, X15, #drot

        // Store (x0-x3,x12-x15) to 'out'
        stm             r4, {X0,X1,X2,X3,X12,X13,X14,X15}

        pop             {r4-r11,pc}

1:      _chacha_permute 12
        b               0b
ENDPROC(hchacha_block_arm)
