// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * SM4 Cipher Algorithm, AES-NI/AVX2 optimized.
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
 * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 *  https://github.com/mjosaarinen/sm4ni
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

#define rRIP         (%rip)

/* vector registers */
#define RX0          %ymm0
#define RX1          %ymm1
#define MASK_4BIT    %ymm2
#define RTMP0        %ymm3
#define RTMP1        %ymm4
#define RTMP2        %ymm5
#define RTMP3        %ymm6
#define RTMP4        %ymm7

#define RA0          %ymm8
#define RA1          %ymm9
#define RA2          %ymm10
#define RA3          %ymm11

#define RB0          %ymm12
#define RB1          %ymm13
#define RB2          %ymm14
#define RB3          %ymm15

#define RNOT         %ymm0
#define RBSWAP       %ymm1

#define RX0x         %xmm0
#define RX1x         %xmm1
#define MASK_4BITx   %xmm2

#define RNOTx        %xmm0
#define RBSWAPx      %xmm1

#define RTMP0x       %xmm3
#define RTMP1x       %xmm4
#define RTMP2x       %xmm5
#define RTMP3x       %xmm6
#define RTMP4x       %xmm7


/* helper macros */

/* Transpose four 32-bit words between 128-bit vector lanes. */
#define transpose_4x4(x0, x1, x2, x3, t1, t2)	\
	vpunpckhdq x1, x0, t2;			\
	vpunpckldq x1, x0, x0;			\
						\
	vpunpckldq x3, x2, t1;			\
	vpunpckhdq x3, x2, x2;			\
						\
	vpunpckhqdq t1, x0, x1;			\
	vpunpcklqdq t1, x0, x0;			\
						\
	vpunpckhqdq x2, t2, x3;			\
	vpunpcklqdq x2, t2, x2;
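/*
 * The two SubByte transform helpers below apply an 8x8 affine transform
 * over GF(2) to every byte by splitting it into 4-bit halves, running
 * each half through a 16-entry vpshufb look-up table and XOR'ing the two
 * results. A rough per-byte C sketch of the same idea (illustration
 * only, not part of this file; the helper name and table arguments are
 * made up):
 *
 *	static u8 affine_by_nibbles(u8 x, const u8 lo_t[16], const u8 hi_t[16])
 *	{
 *		return lo_t[x & 0x0f] ^ hi_t[x >> 4];	// low/high nibble LUTs
 *	}
 *
 * Used with the .Lpre_tf_* and .Lpost_tf_* tables further down, this
 * moves bytes between the SM4 and AES S-box domains, letting AESENCLAST
 * provide the SM4 S-box (see the table comments below).
 */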
/* pre-SubByte transform. */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;			\
	vpandn x, mask4bit, x;				\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;

/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
 * 'vaesenclast' instruction. */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpandn mask4bit, x, tmp0;			\
	vpsrld $4, x, x;				\
	vpand x, mask4bit, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;


.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * Following four affine transform look-up tables are from work by
 * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * These allow exposing SM4 S-Box from AES SubByte.
 */

/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
	.quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
	.quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
	.quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
	.quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
	.byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
	.byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
	.byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
	.byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
	.byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
	.byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
	.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef

.text
SYM_FUNC_START_LOCAL(__sm4_crypt_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
	 *						plaintext blocks
	 * output:
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
	 *						ciphertext blocks
	 */
	FRAME_BEGIN

	vbroadcasti128 .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	vpbroadcastd .L0f0f0f0f rRIP, MASK_4BIT;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
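/*
 * Each ROUND() below performs one SM4 round per 128-bit lane on the four
 * state words s0..s3 (and r0..r3 for the second block set):
 * s0 = s0 ^ L(S(s1 ^ s2 ^ s3 ^ rk)), where S() is the byte-wise S-box
 * (via AESENCLAST plus the affine transforms above) and
 * L(x) = x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24). The word
 * roles rotate between the four ROUND() calls in the loop, giving 32
 * rounds in eight loop iterations. A scalar C sketch of one round
 * (illustration only, not part of this file; sm4_sbox() stands in for a
 * plain 256-byte S-box look-up and rol32() is the usual 32-bit rotate):
 *
 *	static u32 sm4_round_ref(u32 s0, u32 s1, u32 s2, u32 s3, u32 rk)
 *	{
 *		u32 x = s1 ^ s2 ^ s3 ^ rk;
 *
 *		x = ((u32)sm4_sbox(x >> 24) << 24) |
 *		    ((u32)sm4_sbox((x >> 16) & 0xff) << 16) |
 *		    ((u32)sm4_sbox((x >> 8) & 0xff) << 8) |
 *		     (u32)sm4_sbox(x & 0xff);
 *		return s0 ^ x ^ rol32(x, 2) ^ rol32(x, 10) ^
 *		       rol32(x, 18) ^ rol32(x, 24);
 *	}
 */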
#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3)	\
	vpbroadcastd (4*(round))(%rdi), RX0;		\
	vbroadcasti128 .Lpre_tf_lo_s rRIP, RTMP4;	\
	vbroadcasti128 .Lpre_tf_hi_s rRIP, RTMP1;	\
	vmovdqa RX0, RX1;				\
	vpxor s1, RX0, RX0;				\
	vpxor s2, RX0, RX0;				\
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */	\
	vbroadcasti128 .Lpost_tf_lo_s rRIP, RTMP2;	\
	vbroadcasti128 .Lpost_tf_hi_s rRIP, RTMP3;	\
	vpxor r1, RX1, RX1;				\
	vpxor r2, RX1, RX1;				\
	vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */	\
							\
	/* sbox, non-linear part */			\
	transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
	transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0); \
	vextracti128 $1, RX0, RTMP4x;			\
	vextracti128 $1, RX1, RTMP0x;			\
	vaesenclast MASK_4BITx, RX0x, RX0x;		\
	vaesenclast MASK_4BITx, RTMP4x, RTMP4x;		\
	vaesenclast MASK_4BITx, RX1x, RX1x;		\
	vaesenclast MASK_4BITx, RTMP0x, RTMP0x;		\
	vinserti128 $1, RTMP4x, RX0, RX0;		\
	vbroadcasti128 .Linv_shift_row rRIP, RTMP4;	\
	vinserti128 $1, RTMP0x, RX1, RX1;		\
	transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
	transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0); \
							\
	/* linear part */				\
	vpshufb RTMP4, RX0, RTMP0;			\
	vpxor RTMP0, s0, s0; /* s0 ^ x */		\
	vpshufb RTMP4, RX1, RTMP2;			\
	vbroadcasti128 .Linv_shift_row_rol_8 rRIP, RTMP4; \
	vpxor RTMP2, r0, r0; /* r0 ^ x */		\
	vpshufb RTMP4, RX0, RTMP1;			\
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */	\
	vpshufb RTMP4, RX1, RTMP3;			\
	vbroadcasti128 .Linv_shift_row_rol_16 rRIP, RTMP4; \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */	\
	vpshufb RTMP4, RX0, RTMP1;			\
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
	vpshufb RTMP4, RX1, RTMP3;			\
	vbroadcasti128 .Linv_shift_row_rol_24 rRIP, RTMP4; \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
	vpshufb RTMP4, RX0, RTMP1;			\
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */	\
	vpslld $2, RTMP0, RTMP1;			\
	vpsrld $30, RTMP0, RTMP0;			\
	vpxor RTMP0, s0, s0;				\
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP1, s0, s0;				\
	vpshufb RTMP4, RX1, RTMP3;			\
	vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */	\
	vpslld $2, RTMP2, RTMP3;			\
	vpsrld $30, RTMP2, RTMP2;			\
	vpxor RTMP2, r0, r0;				\
	/* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP3, r0, r0;

	leaq (32*4)(%rdi), %rax;
.align 16
.Lroundloop_blk8:
	ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
	ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
	ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
	ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk8;

#undef ROUND

	vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	FRAME_END
	RET;
SYM_FUNC_END(__sm4_crypt_blk16)

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp;  \
	vpsubq minus_one, x, x;      \
	vpslldq $8, tmp, tmp;        \
	vpsubq tmp, x, x;
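/*
 * inc_le128() adds 1 to a 128-bit value stored as two little-endian
 * qwords: minus_one holds -1 in the low qword and 0 in the high qword,
 * so the first vpsubq adds 1 to the low qword only, while the
 * vpcmpeqq/vpslldq pair injects the carry into the high qword when the
 * low qword was about to wrap. A C sketch of the same operation
 * (illustration only, not part of this file; the helper name is made up):
 *
 *	static void inc_le128_ref(u64 ctr[2])
 *	{
 *		ctr[0]++;
 *		if (ctr[0] == 0)	// low qword wrapped
 *			ctr[1]++;	// carry into the high qword
 *	}
 *
 * The CTR code below keeps its counters byte-swapped with
 * .Lbswap128_mask so they can be bumped with 64-bit SIMD arithmetic, and
 * only takes the slower per-block inc_le128() path when the low 64 bits
 * of the IV are within 16 of overflowing.
 */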
/*
 * void sm4_aesni_avx2_ctr_enc_blk16(const u32 *rk, u8 *dst,
 *                                   const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx2_ctr_enc_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	movq 8(%rcx), %rax;
	bswapq %rax;

	vzeroupper;

	vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
	vpcmpeqd RNOT, RNOT, RNOT;
	vpsrldq $8, RNOT, RNOT;   /* ab: -1:0 ; cd: -1:0 */
	vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%rcx), RTMP4x;
	vpshufb RTMP3x, RTMP4x, RTMP4x;
	vmovdqa RTMP4x, RTMP0x;
	inc_le128(RTMP4x, RNOTx, RTMP1x);
	vinserti128 $1, RTMP4x, RTMP0, RTMP0;
	vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 16), %rax;
	ja .Lhandle_ctr_carry;

	/* construct IVs */
	vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
	vpshufb RTMP3, RTMP0, RA1;
	vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
	vpshufb RTMP3, RTMP0, RA2;
	vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
	vpshufb RTMP3, RTMP0, RA3;
	vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
	vpshufb RTMP3, RTMP0, RB0;
	vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
	vpshufb RTMP3, RTMP0, RB1;
	vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
	vpshufb RTMP3, RTMP0, RB2;
	vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
	vpshufb RTMP3, RTMP0, RB3;
	vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
	vpshufb RTMP3x, RTMP0x, RTMP0x;

	jmp .Lctr_carry_done;

.Lhandle_ctr_carry:
	/* construct IVs */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
	inc_le128(RTMP0, RNOT, RTMP1);
	inc_le128(RTMP0, RNOT, RTMP1);
	vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
	inc_le128(RTMP0, RNOT, RTMP1);
	vextracti128 $1, RTMP0, RTMP0x;
	vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */

.align 4
.Lctr_carry_done:
	/* store new IV */
	vmovdqu RTMP0x, (%rcx);

	call __sm4_crypt_blk16;

	vpxor (0 * 32)(%rdx), RA0, RA0;
	vpxor (1 * 32)(%rdx), RA1, RA1;
	vpxor (2 * 32)(%rdx), RA2, RA2;
	vpxor (3 * 32)(%rdx), RA3, RA3;
	vpxor (4 * 32)(%rdx), RB0, RB0;
	vpxor (5 * 32)(%rdx), RB1, RB1;
	vpxor (6 * 32)(%rdx), RB2, RB2;
	vpxor (7 * 32)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 32)(%rsi);
	vmovdqu RA1, (1 * 32)(%rsi);
	vmovdqu RA2, (2 * 32)(%rsi);
	vmovdqu RA3, (3 * 32)(%rsi);
	vmovdqu RB0, (4 * 32)(%rsi);
	vmovdqu RB1, (5 * 32)(%rsi);
	vmovdqu RB2, (6 * 32)(%rsi);
	vmovdqu RB3, (7 * 32)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx2_ctr_enc_blk16)

/*
 * void sm4_aesni_avx2_cbc_dec_blk16(const u32 *rk, u8 *dst,
 *                                   const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx2_cbc_dec_blk16)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	vzeroupper;

	vmovdqu (0 * 32)(%rdx), RA0;
	vmovdqu (1 * 32)(%rdx), RA1;
	vmovdqu (2 * 32)(%rdx), RA2;
	vmovdqu (3 * 32)(%rdx), RA3;
	vmovdqu (4 * 32)(%rdx), RB0;
	vmovdqu (5 * 32)(%rdx), RB1;
	vmovdqu (6 * 32)(%rdx), RB2;
	vmovdqu (7 * 32)(%rdx), RB3;

	call __sm4_crypt_blk16;

	vmovdqu (%rcx), RNOTx;
	vinserti128 $1, (%rdx), RNOT, RNOT;
	vpxor RNOT, RA0, RA0;
	vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
	vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
	vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
	vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
	vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
	vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
	vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
	vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
	vmovdqu RNOTx, (%rcx); /* store new IV */

	vmovdqu RA0, (0 * 32)(%rsi);
	vmovdqu RA1, (1 * 32)(%rsi);
	vmovdqu RA2, (2 * 32)(%rsi);
	vmovdqu RA3, (3 * 32)(%rsi);
	vmovdqu RB0, (4 * 32)(%rsi);
	vmovdqu RB1, (5 * 32)(%rsi);
	vmovdqu RB2, (6 * 32)(%rsi);
	vmovdqu RB3, (7 * 32)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx2_cbc_dec_blk16)
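/*
 * Note on the CBC path above: all 16 blocks are decrypted first, then
 * each result is XOR'ed with the previous ciphertext block (the IV for
 * block 0), which is why the vpxor operands are offset by 16 bytes into
 * the source buffer and the last ciphertext block becomes the new IV.
 * A simplified C sketch of that chaining (illustration only, not part of
 * this file; sm4_decrypt_block() is a made-up stand-in for the 16-block
 * decryption, and unlike the assembly this sketch assumes dst and src do
 * not overlap):
 *
 *	static void cbc_dec_ref(const u32 *rk, u8 *dst, const u8 *src,
 *				u8 *iv, int nblocks)
 *	{
 *		int i, j;
 *
 *		for (i = 0; i < nblocks; i++) {
 *			const u8 *prev = i ? src + 16 * (i - 1) : iv;
 *			u8 buf[16];
 *
 *			sm4_decrypt_block(rk, buf, src + 16 * i);
 *			for (j = 0; j < 16; j++)
 *				dst[16 * i + j] = buf[j] ^ prev[j];
 *		}
 *		memcpy(iv, src + 16 * (nblocks - 1), 16);	// chain new IV
 *	}
 */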