/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

/* Shared 8-bit -> 32-bit CAST s-box tables (defined outside this file). */
.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/*
 * structure of crypto context
 *
 * Byte offsets into the cast6 key context: the 32-bit masking keys (km)
 * start at offset 0; the rotation keys (kr) start right after the
 * 12*4 u32 masking keys.
 * NOTE(review): layout inferred from these offsets and the accesses
 * below — confirm against the C-side cast6 context definition.
 */
#define km 0
#define kr (12*4*4)

/* s-boxes */
#define s1 cast_s1
#define s2 cast_s2
#define s3 cast_s3
#define s4 cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/

/* Context pointer; %r15 is callee-saved, so it is pushed/popped below. */
#define CTX %r15

/*
 * Eight 128-bit blocks held as two groups of four: after inpack_blocks
 * each of RA/RB/RC/RD holds one 32-bit word column from four blocks.
 */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

/* Scratch vector for round-function intermediates. */
#define RX %xmm8

/* Round-key state, refreshed per round by get_round_keys(). */
#define RKM  %xmm9		/* broadcast 32-bit masking key km[nn] */
#define RKR  %xmm10		/* queue of rotation-key bytes for this key group */
#define RKRF %xmm11		/* current rotation amount (low 5 bits of RKR byte 0) */
#define RKRR %xmm12		/* 32 - RKRF, for the right-shift half of a rotate */
#define R32  %xmm13		/* constant 32 (from .L32_mask) */
#define R1ST %xmm14		/* constant 0x1f first-byte mask (from .Lfirst_mask) */

#define RTMP %xmm15

/*
 * GP registers for the scalar s-box lookup phase. RID1/RID2 alternate
 * between byte-index and table-base roles inside lookup_32bit().
 */
#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

/* RGI1-RGI4 receive the four 64-bit lanes extracted from RX/RTMP. */
#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx		/* callee-saved: saved/restored in the blk8 helpers */
#define RGI4bl %bl
#define RGI4bh %bh

/* Accumulators for the looked-up 32-bit round-function results. */
#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


/*
 * Compute one 32-bit CAST round-function output in dst from the low
 * 32 bits of src: four byte-indexed table lookups (s1..s4) combined
 * with op1/op2/op3 (xorl/subl/addl, selected by F1/F2/F3 below).
 * Consumes src 16 bits at a time via shrq; interleave_op/il_reg allow
 * the caller to advance a second source register in the shadow of the
 * lookups (shr_next) or to do nothing (dummy).
 */
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,	RID1d;		\
	leaq		s1(%rip),	RID2;		\
	movl		(RID2,RID1,4),	dst ## d;	\
	movzbl		src ## bl,	RID2d;		\
	leaq		s2(%rip),	RID1;		\
	op1		(RID1,RID2,4),	dst ## d;	\
	shrq $16,	src;				\
	movzbl		src ## bh,	RID1d;		\
	leaq		s3(%rip),	RID2;		\
	op2		(RID2,RID1,4),	dst ## d;	\
	movzbl		src ## bl,	RID2d;		\
	interleave_op(il_reg);				\
	leaq		s4(%rip),	RID1;		\
	op3		(RID1,RID2,4),	dst ## d;

#define dummy(d) /* do nothing */

/* Advance to the next 32-bit word of reg (used as interleave_op). */
#define shr_next(reg) \
	shrq $16,	reg;

/*
 * Vector head of the round function: x = rotl(op0(a, RKM), RKRF),
 * built from shift-left by RKRF, shift-right by RKRR (= 32 - RKRF)
 * and OR. The result is then spilled to two GP registers (gi1/gi2,
 * one 64-bit half each) for the scalar s-box lookups in F_tail.
 */
#define F_head(a, x, gi1, gi2, op0) \
	op0	a, RKM, x;		 \
	vpslld	RKRF, x, RTMP;		 \
	vpsrld	RKRR, x, x;		 \
	vpor	RTMP, x, x;		 \
	\
	vmovq		x, gi1;		 \
	vpextrq $1,	x, gi2;

/*
 * Scalar tail of the round function: run lookup_32bit() on all four
 * 32-bit lanes (two per GP register), reassemble the two 64-bit halves
 * in RFS2/RFS3 and pack them back into vector register x.
 */
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1);	\
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2);	\
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);		\
	shlq $32,	RFS2;						\
	orq		RFS1, RFS2;					\
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);		\
	shlq $32,	RFS1;						\
	orq		RFS1, RFS3;					\
	\
	vmovq		RFS2, x;					\
	vpinsrq $1,	RFS3, x, x;

/*
 * Apply one round function to both 4-block groups:
 *   a1 ^= F(b1);  a2 ^= F(b2)
 * The two F_head results share RX only briefly; the second group's
 * tail result lands in RTMP so both stay live until the final xors.
 */
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0);		\
	F_head(b2, RX, RGI3, RGI4, op0);		\
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);	\
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);	\
	\
	vpxor	a1, RX, a1;				\
	vpxor	a2, RTMP, a2;

/* The three CAST round-function types: same skeleton, rotated op sets. */
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

/* qop(in, out, f): out ^= F<f>(in), applied to both block groups. */
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

/*
 * Load the keys for round nn: broadcast masking key km[nn] into RKM,
 * mask the head rotation byte to 5 bits (R1ST = 0x1f) into RKRF,
 * derive RKRR = 32 - RKRF for the rotate emulation in F_head, and
 * pop the consumed byte off the RKR queue.
 */
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM;	\
	vpand		R1ST, RKR, RKRF;		\
	vpsubq		RKRF, R32, RKRR;		\
	vpsrldq $1,	RKR, RKR;

/* Forward quad-round n: C ^= F1(D); B ^= F2(C); A ^= F3(B); D ^= F1(A). */
#define Q(n) \
	get_round_keys(4*n+0);	\
	qop(RD, RC, 1);		\
	\
	get_round_keys(4*n+1);	\
	qop(RC, RB, 2);		\
	\
	get_round_keys(4*n+2);	\
	qop(RB, RA, 3);		\
	\
	get_round_keys(4*n+3);	\
	qop(RA, RD, 1);

/* Inverse quad-round n: same four ops as Q(n) in reverse order. */
#define QBAR(n) \
	get_round_keys(4*n+3);	\
	qop(RA, RD, 1);		\
	\
	get_round_keys(4*n+2);	\
	qop(RB, RA, 3);		\
	\
	get_round_keys(4*n+1);	\
	qop(RC, RB, 2);		\
	\
	get_round_keys(4*n+0);	\
	qop(RD, RC, 1);

/* Reorder the 16 queued rotation bytes to match the Q/QBAR schedule. */
#define shuffle(mask) \
	vpshufb		mask(%rip), RKR, RKR;

/*
 * Load the 16 rotation-key bytes for key group n into RKR and reorder
 * them (do_mask = shuffle, or dummy when natural order is correct).
 */
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask(%rip), RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX), RKR, RKR;  \
	do_mask(mask);

/* 4x4 32-bit matrix transpose across x0..x3 (t0-t2 are scratch). */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

/* Byte-swap four loaded blocks, then transpose to word-column layout. */
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1; \
	vpshufb rmask,	x2, x2; \
	vpshufb rmask,	x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/* Inverse of inpack_blocks: transpose back, then byte-swap for output. */
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask,	x0, x0; \
	vpshufb rmask,	x1, x1; \
	vpshufb rmask,	x2, x2; \
	vpshufb rmask,	x3, x3;

.section	.rodata.cst16, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/* Rotation-byte orderings for preload_rkr()'s shuffle, per key group. */
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16
.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0
.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

.align 8
SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	/* Save callee-saved regs used as CTX (%r15) and RGI4 (%rbx). */
	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* 12 quad-rounds: 6 forward (Q), then 6 inverse (QBAR). */
	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	/* Only vector regs are used from here on; restore GP regs early. */
	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask(%rip), RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_enc_blk8)

.align 8
SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	/* Save callee-saved regs used as CTX (%r15) and RGI4 (%rbx). */
	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* Encryption schedule run backwards: Q undoes QBAR and vice versa. */
	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	/* Only vector regs are used from here on; restore GP regs early. */
	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask(%rip), RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_dec_blk8)

SYM_FUNC_START(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rcx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;	/* dst; %rsi is clobbered as RID2 in the helper */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_enc_8way)

SYM_FUNC_START(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;	/* dst; %rsi is clobbered as RID2 in the helper */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_dec_8way)

SYM_FUNC_START(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;	/* dst */
	movq %rdx, %r12;	/* src, kept for the CBC xor chain after decrypt */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_cbc_dec_8way)
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.