/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm, AES-NI/AVX optimized.
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2018 Markku-Juhani O. Saarinen <mjos@iki.fi>
 * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

/* Based on SM4 AES-NI work by libgcrypt and Markku-Juhani O. Saarinen at:
 *  https://github.com/mjosaarinen/sm4ni
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

#define rRIP         (%rip)

#define RX0          %xmm0
#define RX1          %xmm1
#define MASK_4BIT    %xmm2
#define RTMP0        %xmm3
#define RTMP1        %xmm4
#define RTMP2        %xmm5
#define RTMP3        %xmm6
#define RTMP4        %xmm7

#define RA0          %xmm8
#define RA1          %xmm9
#define RA2          %xmm10
#define RA3          %xmm11

#define RB0          %xmm12
#define RB1          %xmm13
#define RB2          %xmm14
#define RB3          %xmm15

#define RNOT         %xmm0
#define RBSWAP       %xmm1


/* Transpose four 32-bit words between 128-bit vectors. */
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2;                \
	vpunpckldq x1, x0, x0;                \
	                                      \
	vpunpckldq x3, x2, t1;                \
	vpunpckhdq x3, x2, x2;                \
	                                      \
	vpunpckhqdq t1, x0, x1;               \
	vpunpcklqdq t1, x0, x0;               \
	                                      \
	vpunpckhqdq x2, t2, x3;               \
	vpunpcklqdq x2, t2, x2;

/* pre-SubByte transform. */
#define transform_pre(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0;                     \
	vpandn x, mask4bit, x;                       \
	vpsrld $4, x, x;                             \
	                                             \
	vpshufb tmp0, lo_t, tmp0;                    \
	vpshufb x, hi_t, x;                          \
	vpxor tmp0, x, x;

/* post-SubByte transform. Note: x has been XOR'ed with mask4bit by
 * the 'vaesenclast' instruction.
 */
#define transform_post(x, lo_t, hi_t, mask4bit, tmp0) \
	vpandn mask4bit, x, tmp0;                     \
	vpsrld $4, x, x;                              \
	vpand x, mask4bit, x;                         \
	                                              \
	vpshufb tmp0, lo_t, tmp0;                     \
	vpshufb x, hi_t, x;                           \
	vpxor tmp0, x, x;


.section	.rodata.cst16, "aM", @progbits, 16
.align 16

/*
 * Following four affine transform look-up tables are from work by
 * Markku-Juhani O. Saarinen, at https://github.com/mjosaarinen/sm4ni
 *
 * These allow exposing SM4 S-Box from AES SubByte.
 */

/* pre-SubByte affine transform, from SM4 field to AES field. */
.Lpre_tf_lo_s:
	.quad 0x9197E2E474720701, 0xC7C1B4B222245157
.Lpre_tf_hi_s:
	.quad 0xE240AB09EB49A200, 0xF052B91BF95BB012

/* post-SubByte affine transform, from AES field to SM4 field. */
.Lpost_tf_lo_s:
	.quad 0x5B67F2CEA19D0834, 0xEDD14478172BBE82
.Lpost_tf_hi_s:
	.quad 0xAE7201DD73AFDC00, 0x11CDBE62CC1063BF

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* Inverse shift row + Rotate left by 8 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_8:
	.byte 0x07, 0x00, 0x0d, 0x0a, 0x0b, 0x04, 0x01, 0x0e
	.byte 0x0f, 0x08, 0x05, 0x02, 0x03, 0x0c, 0x09, 0x06

/* Inverse shift row + Rotate left by 16 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_16:
	.byte 0x0a, 0x07, 0x00, 0x0d, 0x0e, 0x0b, 0x04, 0x01
	.byte 0x02, 0x0f, 0x08, 0x05, 0x06, 0x03, 0x0c, 0x09

/* Inverse shift row + Rotate left by 24 bits on 32-bit words with vpshufb */
.Linv_shift_row_rol_24:
	.byte 0x0d, 0x0a, 0x07, 0x00, 0x01, 0x0e, 0x0b, 0x04
	.byte 0x05, 0x02, 0x0f, 0x08, 0x09, 0x06, 0x03, 0x0c

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For input word byte-swap */
.Lbswap32_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

.align 4
/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

/* 12 bytes, only for padding */
.Lpadding_deadbeef:
	.long 0xdeadbeef, 0xdeadbeef, 0xdeadbeef
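/*
 * For reference: the tables above expose the SM4 S-box through AES-NI.
 * Input bytes are mapped from the SM4 field to the AES field with the
 * .Lpre_tf_* tables (split 4-bit vpshufb lookups in transform_pre), the AES
 * S-box is then applied by AESENCLAST (whose ShiftRows step is undone by the
 * .Linv_shift_row* shuffles), and the result is mapped back with the
 * .Lpost_tf_* tables in transform_post.  A rough per-byte C sketch, using
 * hypothetical helpers affine8() and aes_sbox() purely for illustration:
 *
 *	static u8 sm4_sbox_via_aesni(u8 x)
 *	{
 *		x = affine8(pre_matrix, pre_const, x);      // SM4 field -> AES field
 *		x = aes_sbox(x);                            // provided by AESENCLAST
 *		return affine8(post_matrix, post_const, x); // AES field -> SM4 field
 *	}
 */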

.text

/*
 * void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
SYM_FUNC_START(sm4_aesni_avx_crypt4)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (1..4 blocks)
	 *	%rdx: src (1..4 blocks)
	 *	%rcx: num blocks (1..4)
	 */
	FRAME_BEGIN

	vmovdqu 0*16(%rdx), RA0;
	vmovdqa RA0, RA1;
	vmovdqa RA0, RA2;
	vmovdqa RA0, RA3;
	cmpq $2, %rcx;
	jb .Lblk4_load_input_done;
	vmovdqu 1*16(%rdx), RA1;
	je .Lblk4_load_input_done;
	vmovdqu 2*16(%rdx), RA2;
	cmpq $3, %rcx;
	je .Lblk4_load_input_done;
	vmovdqu 3*16(%rdx), RA3;

.Lblk4_load_input_done:

	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;
	vmovdqa .Lpre_tf_hi_s rRIP, RB0;
	vmovdqa .Lpost_tf_lo_s rRIP, RB1;
	vmovdqa .Lpost_tf_hi_s rRIP, RB2;
	vmovdqa .Linv_shift_row rRIP, RB3;
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP2;
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP3;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);

#define ROUND(round, s0, s1, s2, s3)                                \
	vbroadcastss (4*(round))(%rdi), RX0;                        \
	vpxor s1, RX0, RX0;                                         \
	vpxor s2, RX0, RX0;                                         \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */                 \
	                                                            \
	/* sbox, non-linear part */                                 \
	transform_pre(RX0, RTMP4, RB0, MASK_4BIT, RTMP0);           \
	vaesenclast MASK_4BIT, RX0, RX0;                            \
	transform_post(RX0, RB1, RB2, MASK_4BIT, RTMP0);            \
	                                                            \
	/* linear part */                                           \
	vpshufb RB3, RX0, RTMP0;                                    \
	vpxor RTMP0, s0, s0; /* s0 ^ x */                           \
	vpshufb RTMP2, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */               \
	vpshufb RTMP3, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb .Linv_shift_row_rol_24 rRIP, RX0, RTMP1;            \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */               \
	vpslld $2, RTMP0, RTMP1;                                    \
	vpsrld $30, RTMP0, RTMP0;                                   \
	vpxor RTMP0, s0, s0;                                        \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpxor RTMP1, s0, s0;
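/*
 * For reference, each ROUND() invocation below computes, per 32-bit state
 * word, the usual SM4 round (C sketch for illustration only; rol32() as in
 * <linux/bitops.h>, sm4_sbox_bytes() is a hypothetical per-byte S-box
 * helper, not code from this file):
 *
 *	x   = s1 ^ s2 ^ s3 ^ rk[round];
 *	x   = sm4_sbox_bytes(x);
 *	s0 ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
 *
 * Inside ROUND(), the byte shuffles accumulate x ^ rol(x,8) ^ rol(x,16) in
 * RTMP0 and the vpslld/vpsrld pair rotates that by 2, yielding
 * rol(x,2) ^ rol(x,10) ^ rol(x,18); x and rol(x,24) are XORed into s0
 * directly, so no vector rotate instruction is needed.
 */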
	leaq (32*4)(%rdi), %rax;
	.align 16
.Lroundloop_blk4:
	ROUND(0, RA0, RA1, RA2, RA3);
	ROUND(1, RA1, RA2, RA3, RA0);
	ROUND(2, RA2, RA3, RA0, RA1);
	ROUND(3, RA3, RA0, RA1, RA2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk4;

#undef ROUND

	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;

	vmovdqu RA0, 0*16(%rsi);
	cmpq $2, %rcx;
	jb .Lblk4_store_output_done;
	vmovdqu RA1, 1*16(%rsi);
	je .Lblk4_store_output_done;
	vmovdqu RA2, 2*16(%rsi);
	cmpq $3, %rcx;
	je .Lblk4_store_output_done;
	vmovdqu RA3, 3*16(%rsi);

.Lblk4_store_output_done:
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_crypt4)
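/*
 * Example (sketch): how a C-level caller might drive sm4_aesni_avx_crypt4
 * above for bulk encryption.  The declaration and the kernel_fpu_begin()/
 * kernel_fpu_end() bracketing mirror common usage of such helpers; the
 * ctx/rkey naming is illustrative and not taken from this file:
 *
 *	asmlinkage void sm4_aesni_avx_crypt4(const u32 *rk, u8 *dst,
 *					     const u8 *src, int nblocks);
 *
 *	kernel_fpu_begin();
 *	while (nbytes >= SM4_BLOCK_SIZE) {
 *		unsigned int nblocks = min(nbytes / SM4_BLOCK_SIZE, 4u);
 *
 *		sm4_aesni_avx_crypt4(ctx->rkey_enc, dst, src, nblocks);
 *		src    += nblocks * SM4_BLOCK_SIZE;
 *		dst    += nblocks * SM4_BLOCK_SIZE;
 *		nbytes -= nblocks * SM4_BLOCK_SIZE;
 *	}
 *	kernel_fpu_end();
 */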
SYM_FUNC_START_LOCAL(__sm4_crypt_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						plaintext blocks
	 * output:
	 *	RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
	 *						ciphertext blocks
	 */
	FRAME_BEGIN

	vmovdqa .Lbswap32_mask rRIP, RTMP2;
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	vbroadcastss .L0f0f0f0f rRIP, MASK_4BIT;
	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);

#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3)                \
	vbroadcastss (4*(round))(%rdi), RX0;                        \
	vmovdqa .Lpre_tf_lo_s rRIP, RTMP4;                          \
	vmovdqa .Lpre_tf_hi_s rRIP, RTMP1;                          \
	vmovdqa RX0, RX1;                                           \
	vpxor s1, RX0, RX0;                                         \
	vpxor s2, RX0, RX0;                                         \
	vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */                 \
	vmovdqa .Lpost_tf_lo_s rRIP, RTMP2;                         \
	vmovdqa .Lpost_tf_hi_s rRIP, RTMP3;                         \
	vpxor r1, RX1, RX1;                                         \
	vpxor r2, RX1, RX1;                                         \
	vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */                 \
	                                                            \
	/* sbox, non-linear part */                                 \
	transform_pre(RX0, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
	transform_pre(RX1, RTMP4, RTMP1, MASK_4BIT, RTMP0);         \
	vmovdqa .Linv_shift_row rRIP, RTMP4;                        \
	vaesenclast MASK_4BIT, RX0, RX0;                            \
	vaesenclast MASK_4BIT, RX1, RX1;                            \
	transform_post(RX0, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
	transform_post(RX1, RTMP2, RTMP3, MASK_4BIT, RTMP0);        \
	                                                            \
	/* linear part */                                           \
	vpshufb RTMP4, RX0, RTMP0;                                  \
	vpxor RTMP0, s0, s0; /* s0 ^ x */                           \
	vpshufb RTMP4, RX1, RTMP2;                                  \
	vmovdqa .Linv_shift_row_rol_8 rRIP, RTMP4;                  \
	vpxor RTMP2, r0, r0; /* r0 ^ x */                           \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) */               \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vmovdqa .Linv_shift_row_rol_16 rRIP, RTMP4;                 \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) */               \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vmovdqa .Linv_shift_row_rol_24 rRIP, RTMP4;                 \
	vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */   \
	vpshufb RTMP4, RX0, RTMP1;                                  \
	vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */               \
	/* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP0, RTMP1;                                    \
	vpsrld $30, RTMP0, RTMP0;                                   \
	vpxor RTMP0, s0, s0;                                        \
	vpxor RTMP1, s0, s0;                                        \
	vpshufb RTMP4, RX1, RTMP3;                                  \
	vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */               \
	/* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
	vpslld $2, RTMP2, RTMP3;                                    \
	vpsrld $30, RTMP2, RTMP2;                                   \
	vpxor RTMP2, r0, r0;                                        \
	vpxor RTMP3, r0, r0;

	leaq (32*4)(%rdi), %rax;
	.align 16
.Lroundloop_blk8:
	ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
	ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
	ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
	ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
	leaq (4*4)(%rdi), %rdi;
	cmpq %rax, %rdi;
	jne .Lroundloop_blk8;

#undef ROUND

	vmovdqa .Lbswap128_mask rRIP, RTMP2;

	transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
	transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
	vpshufb RTMP2, RA0, RA0;
	vpshufb RTMP2, RA1, RA1;
	vpshufb RTMP2, RA2, RA2;
	vpshufb RTMP2, RA3, RA3;
	vpshufb RTMP2, RB0, RB0;
	vpshufb RTMP2, RB1, RB1;
	vpshufb RTMP2, RB2, RB2;
	vpshufb RTMP2, RB3, RB3;

	FRAME_END
	RET;
SYM_FUNC_END(__sm4_crypt_blk8)

/*
 * void sm4_aesni_avx_crypt8(const u32 *rk, u8 *dst,
 *                           const u8 *src, int nblocks)
 */
SYM_FUNC_START(sm4_aesni_avx_crypt8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (1..8 blocks)
	 *	%rdx: src (1..8 blocks)
	 *	%rcx: num blocks (1..8)
	 */
	cmpq $5, %rcx;
	jb sm4_aesni_avx_crypt4;

	FRAME_BEGIN

	vmovdqu (0 * 16)(%rdx), RA0;
	vmovdqu (1 * 16)(%rdx), RA1;
	vmovdqu (2 * 16)(%rdx), RA2;
	vmovdqu (3 * 16)(%rdx), RA3;
	vmovdqu (4 * 16)(%rdx), RB0;
	vmovdqa RB0, RB1;
	vmovdqa RB0, RB2;
	vmovdqa RB0, RB3;
	je .Lblk8_load_input_done;
	vmovdqu (5 * 16)(%rdx), RB1;
	cmpq $7, %rcx;
	jb .Lblk8_load_input_done;
	vmovdqu (6 * 16)(%rdx), RB2;
	je .Lblk8_load_input_done;
	vmovdqu (7 * 16)(%rdx), RB3;

.Lblk8_load_input_done:
	call __sm4_crypt_blk8;

	cmpq $6, %rcx;
	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	jb .Lblk8_store_output_done;
	vmovdqu RB1, (5 * 16)(%rsi);
	je .Lblk8_store_output_done;
	vmovdqu RB2, (6 * 16)(%rsi);
	cmpq $7, %rcx;
	je .Lblk8_store_output_done;
	vmovdqu RB3, (7 * 16)(%rsi);

.Lblk8_store_output_done:
	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_crypt8)
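/*
 * The CTR helper below keeps the counter in little-endian form so that the
 * inc_le128 macro can add 1 to the low qword with vpsubq and propagate the
 * carry into the high qword via vpcmpeqq, byte-swapping each result back to
 * big endian before encryption.  A plain C sketch of the equivalent
 * big-endian 128-bit increment (hypothetical helper name, illustration
 * only):
 *
 *	static void ctr128_inc_be(u8 ctr[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++ctr[i] != 0)	// stop once a byte does not wrap
 *				break;
 *	}
 */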
/*
 * void sm4_aesni_avx_ctr_enc_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx_ctr_enc_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	/* load IV and byteswap */
	vmovdqu (%rcx), RA0;

	vmovdqa .Lbswap128_mask rRIP, RBSWAP;
	vpshufb RBSWAP, RA0, RTMP0; /* be => le */

	vpcmpeqd RNOT, RNOT, RNOT;
	vpsrldq $8, RNOT, RNOT; /* low: -1, high: 0 */

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp;  \
	vpsubq minus_one, x, x;      \
	vpslldq $8, tmp, tmp;        \
	vpsubq tmp, x, x;

	/* construct IVs */
	inc_le128(RTMP0, RNOT, RTMP2); /* +1 */
	vpshufb RBSWAP, RTMP0, RA1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +2 */
	vpshufb RBSWAP, RTMP0, RA2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +3 */
	vpshufb RBSWAP, RTMP0, RA3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +4 */
	vpshufb RBSWAP, RTMP0, RB0;
	inc_le128(RTMP0, RNOT, RTMP2); /* +5 */
	vpshufb RBSWAP, RTMP0, RB1;
	inc_le128(RTMP0, RNOT, RTMP2); /* +6 */
	vpshufb RBSWAP, RTMP0, RB2;
	inc_le128(RTMP0, RNOT, RTMP2); /* +7 */
	vpshufb RBSWAP, RTMP0, RB3;
	inc_le128(RTMP0, RNOT, RTMP2); /* +8 */
	vpshufb RBSWAP, RTMP0, RTMP1;

	/* store new IV */
	vmovdqu RTMP1, (%rcx);

	call __sm4_crypt_blk8;

	vpxor (0 * 16)(%rdx), RA0, RA0;
	vpxor (1 * 16)(%rdx), RA1, RA1;
	vpxor (2 * 16)(%rdx), RA2, RA2;
	vpxor (3 * 16)(%rdx), RA3, RA3;
	vpxor (4 * 16)(%rdx), RB0, RB0;
	vpxor (5 * 16)(%rdx), RB1, RB1;
	vpxor (6 * 16)(%rdx), RB2, RB2;
	vpxor (7 * 16)(%rdx), RB3, RB3;

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_ctr_enc_blk8)

/*
 * void sm4_aesni_avx_cbc_dec_blk8(const u32 *rk, u8 *dst,
 *                                 const u8 *src, u8 *iv)
 */
SYM_TYPED_FUNC_START(sm4_aesni_avx_cbc_dec_blk8)
	/* input:
	 *	%rdi: round key array, CTX
	 *	%rsi: dst (8 blocks)
	 *	%rdx: src (8 blocks)
	 *	%rcx: iv
	 */
	FRAME_BEGIN

	vmovdqu (0 * 16)(%rdx), RA0;
	vmovdqu (1 * 16)(%rdx), RA1;
	vmovdqu (2 * 16)(%rdx), RA2;
	vmovdqu (3 * 16)(%rdx), RA3;
	vmovdqu (4 * 16)(%rdx), RB0;
	vmovdqu (5 * 16)(%rdx), RB1;
	vmovdqu (6 * 16)(%rdx), RB2;
	vmovdqu (7 * 16)(%rdx), RB3;

	call __sm4_crypt_blk8;

	vmovdqu (7 * 16)(%rdx), RNOT;
	vpxor (%rcx), RA0, RA0;
	vpxor (0 * 16)(%rdx), RA1, RA1;
	vpxor (1 * 16)(%rdx), RA2, RA2;
	vpxor (2 * 16)(%rdx), RA3, RA3;
	vpxor (3 * 16)(%rdx), RB0, RB0;
	vpxor (4 * 16)(%rdx), RB1, RB1;
	vpxor (5 * 16)(%rdx), RB2, RB2;
	vpxor (6 * 16)(%rdx), RB3, RB3;
	vmovdqu RNOT, (%rcx); /* store new IV */

	vmovdqu RA0, (0 * 16)(%rsi);
	vmovdqu RA1, (1 * 16)(%rsi);
	vmovdqu RA2, (2 * 16)(%rsi);
	vmovdqu RA3, (3 * 16)(%rsi);
	vmovdqu RB0, (4 * 16)(%rsi);
	vmovdqu RB1, (5 * 16)(%rsi);
	vmovdqu RB2, (6 * 16)(%rsi);
	vmovdqu RB3, (7 * 16)(%rsi);

	vzeroall;
	FRAME_END
	RET;
SYM_FUNC_END(sm4_aesni_avx_cbc_dec_blk8)
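/*
 * For reference, sm4_aesni_avx_cbc_dec_blk8 above follows the usual CBC
 * chaining rule: each plaintext is the decrypted ciphertext XOR'ed with the
 * previous ciphertext block (the IV for the first block), and the last
 * ciphertext block becomes the next IV.  All eight ciphertext blocks are
 * loaded before any output is stored, which also allows in-place operation.
 * A C sketch for eight blocks, assuming a hypothetical one-block helper
 * sm4_crypt_block() and the decryption key schedule in rk (illustration
 * only, not code from this file):
 *
 *	for (i = 0; i < 8; i++) {
 *		u8 tmp[16];
 *
 *		sm4_crypt_block(rk, tmp, &src[i * 16]);
 *		xor_16bytes(&dst[i * 16], tmp,
 *			    i ? &src[(i - 1) * 16] : iv);
 *	}
 *	memcpy(iv, &src[7 * 16], 16);	// last ciphertext is the next IV
 */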