/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used.  So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions.  This is faster than using explicit rotate
 * instructions.  To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount.  The rotation amount is then fixed up just in time
 * when the values are used.  'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.  An illustrative C version
 * of this deferred-rotation scheme follows the permutation macros below.
 */

	// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14

.macro _le32_bswap_4x	a, b, c, d,  tmp
#ifdef __ARMEB__
	rev_l		\a,  \tmp
	rev_l		\b,  \tmp
	rev_l		\c,  \tmp
	rev_l		\d,  \tmp
#endif
.endm

.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add		\a1, \a1, \b1, ror #brot
	add		\a2, \a2, \b2, ror #brot
	eor		\d1, \a1, \d1, ror #drot
	eor		\d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add		\c1, \c1, \d1, ror #16
	add		\c2, \c2, \d2, ror #16
	eor		\b1, \c1, \b1, ror #brot
	eor		\b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add		\a1, \a1, \b1, ror #20
	add		\a2, \a2, \b2, ror #20
	eor		\d1, \a1, \d1, ror #16
	eor		\d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add		\c1, \c1, \d1, ror #24
	add		\c2, \c2, \d2, ror #24
	eor		\b1, \c1, \b1, ror #20
	eor		\b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm

.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd		X8_X10, X9_X11, sp, 0
	__ldrd		X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd		X8_X10, X9_X11, sp, 8
	__ldrd		X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

.macro _chacha_permute	nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	 _doubleround
	.endr
.endm
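/*
 * Illustrative only, not assembled: a minimal C sketch of one quarterround
 * with the rotations deferred, mirroring what _halfround does with the
 * 'ror #imm' flexible second operand.  The names ror32() and
 * quarterround_deferred() exist only in this sketch.  On entry,
 * ror32(*b, brot) and ror32(*d, drot) are the true row 'b' and row 'd'
 * values; on return the caller must use brot = 25 and drot = 24.
 *
 *	#include <stdint.h>
 *
 *	static inline uint32_t ror32(uint32_t x, unsigned int n)
 *	{
 *		n &= 31;
 *		return n ? (x >> n) | (x << (32 - n)) : x;
 *	}
 *
 *	static void quarterround_deferred(uint32_t *a, uint32_t *b,
 *					  uint32_t *c, uint32_t *d,
 *					  unsigned int brot, unsigned int drot)
 *	{
 *		// a += b; d ^= a; d = rol(d, 16);  d now needs ror #16
 *		*a += ror32(*b, brot);	*d = *a ^ ror32(*d, drot);
 *		// c += d; b ^= c; b = rol(b, 12);  b now needs ror #20
 *		*c += ror32(*d, 16);	*b = *c ^ ror32(*b, brot);
 *		// a += b; d ^= a; d = rol(d, 8);   d now needs ror #24
 *		*a += ror32(*b, 20);	*d = *a ^ ror32(*d, 16);
 *		// c += d; b ^= c; b = rol(b, 7);   b now needs ror #25
 *		*c += ror32(*d, 24);	*b = *c ^ ror32(*b, 20);
 *	}
 */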
.macro _chacha		nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute	\nrounds

	add		sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push		{X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr		r14, [sp, #96]
	ldr		r12, [sp, #100]
	ldr		r11, [sp, #104]

	orr		r10, r14, r12

	// Use slow path if fewer than 64 bytes remain.
	cmp		r11, #64
	blt		.Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst		r10, #3
	bne		.Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// x0-x3
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	ldmia		r12!, {r8-r11}
	eor		X0, X0, r8
	eor		X1, X1, r9
	eor		X2, X2, r10
	eor		X3, X3, r11
	stmia		r14!, {X0-X3}

	// x4-x7
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	ldmia		r12!, {X0-X3}
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	eor		X4, X4, X0
	eor		X5, X5, X1
	eor		X6, X6, X2
	eor		X7, X7, X3
	stmia		r14!, {X4-X7}

	// x8-x15
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	ldmia		r12!, {r8-r11}
	eor		r0, r0, r8		// x8
	eor		r1, r1, r9		// x9
	eor		r6, r6, r10		// x10
	eor		r7, r7, r11		// x11
	stmia		r14!, {r0,r1,r6,r7}
	ldmia		r12!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	ldr		r9, [sp, #72]		// load LEN
	eor		r2, r2, r0		// x12
	eor		r3, r3, r1		// x13
	eor		r4, r4, r6		// x14
	eor		r5, r5, r7		// x15
	subs		r9, #64			// decrement and check LEN
	stmia		r14!, {r2-r5}

	beq		.Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN

	// Increment block counter (x12)
	add		r8, #1

	// Store updated (OUT, IN, LEN)
	str		r14, [sp, #64]
	str		r12, [sp, #68]
	str		r9, [sp, #72]

	mov		r14, sp

	// Store updated block counter (x12)
	str		r8, [sp, #48]

	sub		sp, #16

	// Reload state and do next block
	ldmia		r14!, {r0-r11}		// load x0-x11
	__strd		r10, r11, sp, 8		// copy x10-x11 below the state
	ldmia		r14, {r10-r12,r14}	// load x12-x15
	b		.Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub		sp, #64
	mov		r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Save keystream for x0-x3
	__ldrd		r8, r9, sp, 96
	__ldrd		r10, r11, sp, 104
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8
	stmia		r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd		r8, r9, sp, 112
	__ldrd		r10, r11, sp, 120
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8
	add		r8, sp, #64
	stmia		r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 128
	__ldrd		r10, r11, sp, 136
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8
	stmia		r14!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 144
	__ldrd		r10, r11, sp, 152
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9
	stmia		r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr		r9, [sp, #168]		// LEN
	ldr		r14, [sp, #160]		// OUT
	cmp		r9, #64
	mov		r0, sp
	movle		r1, r9
	movgt		r1, #64
	// r1 is the number of bytes to XOR, in the range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
	orr		r2, r12, r14
	tst		r2, #3			// IN or OUT misaligned?
	bne		.Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs		r1, #4
	blt		.Lxor_words_done\@
	ldr		r2, [r12], #4
	ldr		r3, [r0], #4
	eor		r2, r2, r3
	str		r2, [r14], #4
.endr
	b		.Lxor_slowpath_done\@
.Lxor_words_done\@:
	ands		r1, r1, #3
	beq		.Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb		r2, [r12], #1
	ldrb		r3, [r0], #1
	eor		r2, r2, r3
	strb		r2, [r14], #1
	subs		r1, #1
	bne		.Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs		r9, #64
	add		sp, #96
	bgt		.Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha
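/*
 * Illustrative only, not assembled: a minimal C model of the slow-path XOR
 * above, where a 64-byte keystream buffer on the stack is combined with the
 * data a word at a time, then any 1-3 byte tail a byte at a time.  The name
 * xor_keystream() exists only in this sketch, and memcpy() stands in for the
 * word loads/stores, so the pre-ARMv6 alignment check is not needed here.
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void xor_keystream(uint8_t *out, const uint8_t *in,
 *				  const uint8_t ks[64], size_t len)
 *	{
 *		size_t i = 0;
 *
 *		// XOR a word at a time
 *		for (; i + 4 <= len; i += 4) {
 *			uint32_t a, b;
 *
 *			memcpy(&a, in + i, 4);
 *			memcpy(&b, ks + i, 4);
 *			a ^= b;
 *			memcpy(out + i, &a, 4);
 *		}
 *		// XOR a byte at a time
 *		for (; i < len; i++)
 *			out[i] = in[i] ^ ks[i];
 *	}
 */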
/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *		     const u32 *state, int nrounds);
 */
ENTRY(chacha_doarm)
	cmp		r2, #0			// bytes == 0?
	reteq		lr

	ldr		ip, [sp]		// nrounds
	cmp		ip, #12

	push		{r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.

	add		X12, r3, #48
	ldm		X12, {X12,X13,X14,X15}
	push		{X12,X13,X14,X15}
	sub		sp, sp, #64

	__ldrd		X8_X10, X9_X11, r3, 40
	__strd		X8_X10, X9_X11, sp, 8
	__strd		X8_X10, X9_X11, sp, 56
	ldm		r3, {X0-X9_X11}
	__strd		X0, X1, sp, 16
	__strd		X2, X3, sp, 24
	__strd		X4, X5, sp, 32
	__strd		X6, X7, sp, 40
	__strd		X8_X10, X9_X11, sp, 48

	beq		1f
	_chacha		20

0:	add		sp, #76
	pop		{r4-r11, pc}

1:	_chacha		12
	b		0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 */
ENTRY(hchacha_block_arm)
	push		{r1,r4-r11,lr}

	cmp		r2, #12			// ChaCha20 or ChaCha12?

	mov		r14, r0
	ldmia		r14!, {r0-r11}		// load x0-x11
	push		{r10-r11}		// store x10-x11 at bottom of stack
	ldm		r14, {r10-r12,r14}	// load x12-x15
	sub		sp, #8

	beq		1f
	_chacha_permute	20

	// Skip over (unused0-unused1, x10-x11)
0:	add		sp, #16

	// Fix up rotations of x12-x15
	ror		X12, X12, #drot
	ror		X13, X13, #drot
	pop		{r4}			// load 'out'
	ror		X14, X14, #drot
	ror		X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop		{r4-r11,pc}

1:	_chacha_permute	12
	b		0b
ENDPROC(hchacha_block_arm)
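/*
 * Hypothetical caller-side sketch, not part of this file: it shows the state
 * layout the code above assumes (the four "expand 32-byte k" constants, the
 * 256-bit key as little-endian words, the 32-bit block counter in state[12],
 * which is the word incremented per block above, and the 96-bit nonce) and a
 * call to chacha_doarm().  The stdint types stand in for the kernel's u8/u32,
 * and chacha20_xor(), key, nonce and counter are names used only here.
 *
 *	#include <stdint.h>
 *
 *	void chacha_doarm(uint8_t *dst, const uint8_t *src, unsigned int bytes,
 *			  const uint32_t *state, int nrounds);
 *
 *	static void chacha20_xor(uint8_t *dst, const uint8_t *src,
 *				 unsigned int bytes, const uint32_t key[8],
 *				 const uint32_t nonce[3], uint32_t counter)
 *	{
 *		uint32_t state[16] = {
 *			// "expand 32-byte k"
 *			0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
 *			// 256-bit key
 *			key[0], key[1], key[2], key[3],
 *			key[4], key[5], key[6], key[7],
 *			// block counter (x12) and nonce
 *			counter, nonce[0], nonce[1], nonce[2],
 *		};
 *
 *		chacha_doarm(dst, src, bytes, state, 20);
 *	}
 */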