~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/arm64/kernel/vdso/vgetrandom-chacha.S

Version: ~ [ linux-6.12-rc7 ] ~ [ linux-6.11.7 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.60 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.116 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.171 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.229 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.285 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.323 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.12 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0
  2 
  3 #include <linux/linkage.h>
  4 #include <asm/cache.h>
  5 #include <asm/assembler.h>
  6 
  7         .text
  8 
  9 #define state0          v0      /* working state row 0, permuted in place */
 10 #define state1          v1      /* working state row 1, permuted in place */
 11 #define state2          v2      /* working state row 2, permuted in place */
 12 #define state3          v3      /* working state row 3, permuted in place */
 13 #define copy0           v4      /* saved input row 0: the ChaCha constant */
 14 #define copy0_q         q4      /* 128-bit view of copy0; not referenced in the visible code */
 15 #define copy1           v5      /* saved input row 1: first half of the key */
 16 #define copy2           v6      /* saved input row 2: second half of the key */
 17 #define copy3           v7      /* saved input row 3: counter and zero nonce */
 18 #define copy3_d         d7      /* low 64 bits of copy3: the block counter */
 19 #define one_d           d16     /* low 64 bits of one_v, used as the counter increment */
 20 #define one_q           q16     /* 128-bit view of one_v; not referenced in the visible code */
 21 #define one_v           v16     /* vector holding the increment constant */
 22 #define tmp             v17     /* scratch for the xor result ahead of the shl/sri rotates */
 23 #define rot8            v18     /* not referenced in the visible code */
 24 
 25 /*
 26  * ARM64 ChaCha20 implementation meant for vDSO.  Produces a given positive
 27  * number of blocks of output with nonce 0, taking an input key and 8-byte
 28  * counter.  Importantly does not spill to the stack.
 29  *
 30  * This implementation avoids d8-d15 because they are callee-save in user
 31  * space.  It clobbers x0, x3, x4, x8, x9, v0-v7, v16, v17 and the flags.
 32  *
 33  * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
 34  *                                     const uint8_t *key,
 35  *                                     uint32_t *counter,
 36  *                                     size_t nblocks)
 37  *
 38  *      x0: output bytes
 39  *      x1: 32-byte key input
 40  *      x2: 8-byte counter input/output
 41  *      x3: number of 64-byte blocks to write to output (must be non-zero)
 42  */
 43 SYM_FUNC_START(__arch_chacha20_blocks_nostack)
 44         /* copy0 = "expand 32-byte k" */
 45         mov_q           x8, 0x3320646e61707865
 46         mov_q           x9, 0x6b20657479622d32
 47         mov             copy0.d[0], x8
 48         mov             copy0.d[1], x9
 49 
 50         /* copy1,copy2 = key */
 51         ld1             { copy1.4s, copy2.4s }, [x1]
 52         /* copy3 = counter || zero nonce  */
 53         ld1             { copy3.2s }, [x2]
 54 
 55         /* one_v = { 1, 0, 1, 0 }, so one_d holds the 64-bit constant 1 */
 56         movi            one_v.2s, #1
 57         uzp1            one_v.4s, one_v.4s, one_v.4s
 58 
 59 .Lblock:
 60         /* Load the working state; copy0-copy3 are kept for the final add. */
 61         mov             state0.16b, copy0.16b
 62         mov             state1.16b, copy1.16b
 63         mov             state2.16b, copy2.16b
 64         mov             state3.16b, copy3.16b
 65 
 66         mov             w4, 20                  /* 20 rounds, two per iteration */
 67         /*
 68          * Permute one 64-byte block where the state matrix is stored in the four NEON
 69          * registers state0-state3.  It performs matrix operations on four words in parallel,
 70          * but requires shuffling to rearrange the words after each round.
 71          */
 72 .Ldoubleround:
 73         /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
 74         add             state0.4s, state0.4s, state1.4s
 75         eor             state3.16b, state3.16b, state0.16b
 76         rev32           state3.8h, state3.8h
 77 
 78         /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
 79         add             state2.4s, state2.4s, state3.4s
 80         eor             tmp.16b, state1.16b, state2.16b
 81         shl             state1.4s, tmp.4s, #12
 82         sri             state1.4s, tmp.4s, #20
 83 
 84         /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
 85         add             state0.4s, state0.4s, state1.4s
 86         eor             tmp.16b, state3.16b, state0.16b
 87         shl             state3.4s, tmp.4s, #8
 88         sri             state3.4s, tmp.4s, #24
 89 
 90         /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
 91         add             state2.4s, state2.4s, state3.4s
 92         eor             tmp.16b, state1.16b, state2.16b
 93         shl             state1.4s, tmp.4s, #7
 94         sri             state1.4s, tmp.4s, #25
 95 
 96         /* state1[0,1,2,3] = state1[1,2,3,0] */
 97         ext             state1.16b, state1.16b, state1.16b, #4
 98         /* state2[0,1,2,3] = state2[2,3,0,1] */
 99         ext             state2.16b, state2.16b, state2.16b, #8
100         /* state3[0,1,2,3] = state3[3,0,1,2] */
101         ext             state3.16b, state3.16b, state3.16b, #12
102 
103         /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
104         add             state0.4s, state0.4s, state1.4s
105         eor             state3.16b, state3.16b, state0.16b
106         rev32           state3.8h, state3.8h
107 
108         /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
109         add             state2.4s, state2.4s, state3.4s
110         eor             tmp.16b, state1.16b, state2.16b
111         shl             state1.4s, tmp.4s, #12
112         sri             state1.4s, tmp.4s, #20
113 
114         /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
115         add             state0.4s, state0.4s, state1.4s
116         eor             tmp.16b, state3.16b, state0.16b
117         shl             state3.4s, tmp.4s, #8
118         sri             state3.4s, tmp.4s, #24
119 
120         /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
121         add             state2.4s, state2.4s, state3.4s
122         eor             tmp.16b, state1.16b, state2.16b
123         shl             state1.4s, tmp.4s, #7
124         sri             state1.4s, tmp.4s, #25
125 
126         /* state1[0,1,2,3] = state1[3,0,1,2] */
127         ext             state1.16b, state1.16b, state1.16b, #12
128         /* state2[0,1,2,3] = state2[2,3,0,1] */
129         ext             state2.16b, state2.16b, state2.16b, #8
130         /* state3[0,1,2,3] = state3[1,2,3,0] */
131         ext             state3.16b, state3.16b, state3.16b, #4
132 
133         subs            w4, w4, #2
134         b.ne            .Ldoubleround
135 
136         /* output0 = state0 + copy0 */
137         add             state0.4s, state0.4s, copy0.4s
138         /* output1 = state1 + copy1 */
139         add             state1.4s, state1.4s, copy1.4s
140         /* output2 = state2 + copy2 */
141         add             state2.4s, state2.4s, copy2.4s
142         /* output3 = state3 + copy3 */
143         add             state3.4s, state3.4s, copy3.4s
144         st1             { state0.16b - state3.16b }, [x0]
145 
146         /*
147          * ++copy3.counter, the 'add' clears the upper half of the SIMD register
148          * which is the expected behaviour here.
149          */
150         add             copy3_d, copy3_d, one_d
151 
152         /* output += 64, --nblocks */
153         add             x0, x0, 64
154         subs            x3, x3, #1
155         b.ne            .Lblock
156 
157         /* counter = copy3.counter */
158         st1             { copy3.2s }, [x2]
159 
160         /* Zero out the potentially sensitive regs, in case nothing uses these again. */
161         /* tmp is included since it still holds the last state1 ^ state2 value. */
162         movi            state0.16b, #0
163         movi            state1.16b, #0
164         movi            state2.16b, #0
165         movi            state3.16b, #0
166         movi            copy1.16b, #0
167         movi            copy2.16b, #0
168         movi            tmp.16b, #0
169         ret
170 SYM_FUNC_END(__arch_chacha20_blocks_nostack)
171 
172 emit_aarch64_feature_1_and      /* macro not defined in this file; presumably emits the GNU property feature note - confirm in the included headers */

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php