// SPDX-License-Identifier: GPL-2.0

#include <linux/linkage.h>
#include <asm/cache.h>
#include <asm/assembler.h>

	.text

#define state0		v0
#define state1		v1
#define state2		v2
#define state3		v3
#define copy0		v4
#define copy0_q		q4
#define copy1		v5
#define copy2		v6
#define copy3		v7
#define copy3_d		d7
#define one_d		d16
#define one_q		q16
#define one_v		v16
#define tmp		v17
#define rot8		v18

/*
 * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
 * number of blocks of output with nonce 0, taking an input key and an
 * 8-byte counter. Importantly, it does not spill to the stack.
 *
 * This implementation avoids d8-d15 because they are callee-save in user
 * space.
 *
 * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
 *				       const uint8_t *key,
 *				       uint32_t *counter,
 *				       size_t nblocks)
 *
 *	x0: output bytes
 *	x1: 32-byte key input
 *	x2: 8-byte counter input/output
 *	x3: number of 64-byte blocks to write to output
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

	/* copy0 = "expand 32-byte k" */
	mov_q		x8, 0x3320646e61707865
	mov_q		x9, 0x6b20657479622d32
	mov		copy0.d[0], x8
	mov		copy0.d[1], x9

	/* copy1,copy2 = key */
	ld1		{ copy1.4s, copy2.4s }, [x1]
	/* copy3 = counter || zero nonce */
	ld1		{ copy3.2s }, [x2]

	/*
	 * one_d = 1 as a 64-bit scalar: movi sets the lanes to { 1, 1, 0, 0 }
	 * and uzp1 keeps the even lanes, leaving { 1, 0, 1, 0 }, so the low
	 * 64 bits (one_d) read back as the integer 1.
	 */
	movi		one_v.2s, #1
	uzp1		one_v.4s, one_v.4s, one_v.4s

.Lblock:
	/* copy state to auxiliary vectors for the final add after the permute. */
	mov		state0.16b, copy0.16b
	mov		state1.16b, copy1.16b
	mov		state2.16b, copy2.16b
	mov		state3.16b, copy3.16b

	mov		w4, 20
.Lpermute:
	/*
	 * Permute one 64-byte block where the state matrix is stored in the
	 * four NEON registers state0-state3. It performs matrix operations
	 * on four words in parallel, but requires shuffling to rearrange the
	 * words after each round.
	 */
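	/*
	 * Rotation idioms used in the rounds below: rotl32(x, 16) is done
	 * with rev32 on the .8h arrangement, which swaps the two 16-bit
	 * halves of each 32-bit word. The 12-, 8- and 7-bit rotations use a
	 * shl/sri pair: shl writes the left-shifted value and sri shifts
	 * right and inserts into the low bits, leaving the high bits that
	 * shl produced intact.
	 */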
.Ldoubleround:
	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	add		state0.4s, state0.4s, state1.4s
	eor		state3.16b, state3.16b, state0.16b
	rev32		state3.8h, state3.8h

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #12
	sri		state1.4s, tmp.4s, #20

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	add		state0.4s, state0.4s, state1.4s
	eor		tmp.16b, state3.16b, state0.16b
	shl		state3.4s, tmp.4s, #8
	sri		state3.4s, tmp.4s, #24

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #7
	sri		state1.4s, tmp.4s, #25

	/* state1[0,1,2,3] = state1[1,2,3,0] */
	ext		state1.16b, state1.16b, state1.16b, #4
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	ext		state2.16b, state2.16b, state2.16b, #8
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	ext		state3.16b, state3.16b, state3.16b, #12

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	add		state0.4s, state0.4s, state1.4s
	eor		state3.16b, state3.16b, state0.16b
	rev32		state3.8h, state3.8h

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #12
	sri		state1.4s, tmp.4s, #20

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	add		state0.4s, state0.4s, state1.4s
	eor		tmp.16b, state3.16b, state0.16b
	shl		state3.4s, tmp.4s, #8
	sri		state3.4s, tmp.4s, #24

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #7
	sri		state1.4s, tmp.4s, #25

	/* state1[0,1,2,3] = state1[3,0,1,2] */
	ext		state1.16b, state1.16b, state1.16b, #12
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	ext		state2.16b, state2.16b, state2.16b, #8
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	ext		state3.16b, state3.16b, state3.16b, #4

	subs		w4, w4, #2
	b.ne		.Ldoubleround

	/* output0 = state0 + copy0 */
	add		state0.4s, state0.4s, copy0.4s
	/* output1 = state1 + copy1 */
	add		state1.4s, state1.4s, copy1.4s
	/* output2 = state2 + copy2 */
	add		state2.4s, state2.4s, copy2.4s
	/* output3 = state3 + copy3 */
	add		state3.4s, state3.4s, copy3.4s
	st1		{ state0.16b - state3.16b }, [x0]

	/*
	 * ++copy3.counter: the scalar 'add' on the D register clears the
	 * upper half of the SIMD register, which is the expected behaviour
	 * here, since the zero nonce lives in the upper lanes.
	 */
	add		copy3_d, copy3_d, one_d

	/* output += 64, --nblocks */
	add		x0, x0, 64
	subs		x3, x3, #1
	b.ne		.Lblock

	/* counter = copy3.counter */
	st1		{ copy3.2s }, [x2]

	/* Zero out the potentially sensitive regs, in case nothing uses these again. */
	movi		state0.16b, #0
	movi		state1.16b, #0
	movi		state2.16b, #0
	movi		state3.16b, #0
	movi		copy1.16b, #0
	movi		copy2.16b, #0
	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)

emit_aarch64_feature_1_and
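/*
 * Illustrative usage sketch, not part of this file; the real consumer is
 * the generic vDSO getrandom() implementation. The buffers below are made
 * up for the example (an all-zero key, for illustration only). The counter
 * is read from and written back through x2, so consecutive calls continue
 * the same keystream:
 *
 *	uint8_t out[64 * 2];
 *	uint8_t key[32] = { 0 };
 *	uint32_t counter[2] = { 0, 0 };
 *
 *	__arch_chacha20_blocks_nostack(out, key, counter, 2);
 *
 * Afterwards counter[0] == 2, so a further call resumes the keystream at
 * block 2.
 */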