/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * AES-XTS for modern x86_64 CPUs
 *
 * Copyright 2024 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

/*
 * This file implements AES-XTS for modern x86_64 CPUs.  To handle the
 * complexities of coding for x86 SIMD, e.g. where every vector length needs
 * different code, it uses a macro to generate several implementations that
 * share similar source code but are targeted at different CPUs:
 *
 * AES-NI + AVX
 *    - 128-bit vectors (1 AES block per vector)
 *    - VEX-coded instructions
 *    - xmm0-xmm15
 *    - This is for older CPUs that lack VAES but do have AVX.
 *
 * VAES + VPCLMULQDQ + AVX2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - VEX-coded instructions
 *    - ymm0-ymm15
 *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
 *      e.g. Intel's Alder Lake and AMD's Zen 3.
 *
 * VAES + VPCLMULQDQ + AVX10/256 + BMI2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - EVEX-coded instructions
 *    - ymm0-ymm31
 *    - This is for CPUs that have AVX512 but where using zmm registers causes
 *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
 *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
 *      To avoid confusion with 512-bit, we just write AVX10/256.
 *
 * VAES + VPCLMULQDQ + AVX10/512 + BMI2
 *    - Same as the previous one, but upgrades to 512-bit vectors
 *      (4 AES blocks per vector) in zmm0-zmm31.
 *    - This is for CPUs that have good AVX512 or AVX10/512 support.
 *
 * This file doesn't have an implementation for AES-NI alone (without AVX), as
 * the lack of VEX would make all the assembly code quite different.
 *
 * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
 * the XTS tweaks.  This avoids a bottleneck.  Currently there don't seem to be
 * any CPUs that support VAES but not VPCLMULQDQ.  If that changes, we might
 * need to start also providing an implementation using VAES alone.
 *
 * The AES-XTS implementations in this file support everything required by the
 * crypto API, including support for arbitrary input lengths and multi-part
 * processing.  However, they are most heavily optimized for the common case of
 * power-of-2 length inputs that are processed in a single part (disk sectors).
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.section .rodata
.p2align 4
.Lgf_poly:
        // The low 64 bits of this value represent the polynomial x^7 + x^2 + x
        // + 1.  It is the value that must be XOR'd into the low 64 bits of the
        // tweak each time a 1 is carried out of the high 64 bits.
        //
        // The high 64 bits of this value is just the internal carry bit that
        // exists when there's a carry out of the low 64 bits of the tweak.
        .quad   0x87, 1

        // This table contains constants for vpshufb and vpblendvb, used to
        // handle variable byte shifts and blending during ciphertext stealing
        // on CPUs that don't support AVX10-style masking.
.Lcts_permute_table:
        .byte   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
        .byte   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
        .byte   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
        .byte   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
        .byte   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
        .byte   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.text
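// For reference, the tweak update that .Lgf_poly supports corresponds to the
// following rough C sketch (illustrative only, not part of the build).  It
// multiplies a 128-bit tweak, stored as two little-endian 64-bit halves, by x
// in GF(2^128) with the reduction polynomial x^128 + x^7 + x^2 + x + 1:
//
//	void xts_mul_x(u64 *lo, u64 *hi)
//	{
//		u64 carry = *hi >> 63;
//
//		*hi = (*hi << 1) | (*lo >> 63);
//		*lo = (*lo << 1) ^ (carry ? 0x87 : 0);
//	}
//
// The SIMD code below computes the same thing branchlessly, using the
// .Lgf_poly constant for both the 0x87 reduction and the cross-quadword carry.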
// Function parameters
.set    KEY,            %rdi    // Initially points to crypto_aes_ctx, then is
                                // advanced to point to 7th-from-last round key
.set    SRC,            %rsi    // Pointer to next source data
.set    DST,            %rdx    // Pointer to next destination data
.set    LEN,            %ecx    // Remaining length in bytes
.set    LEN8,           %cl
.set    LEN64,          %rcx
.set    TWEAK,          %r8     // Pointer to next tweak

// %rax holds the AES key length in bytes.
.set    KEYLEN,         %eax
.set    KEYLEN64,       %rax

// %r9-r11 are available as temporaries.

.macro  _define_Vi      i
.if VL == 16
        .set    V\i,            %xmm\i
.elseif VL == 32
        .set    V\i,            %ymm\i
.elseif VL == 64
        .set    V\i,            %zmm\i
.else
        .error "Unsupported Vector Length (VL)"
.endif
.endm

.macro  _define_aliases
        // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
        // are available, that map to the xmm, ymm, or zmm registers according
        // to the selected Vector Length (VL).
        _define_Vi      0
        _define_Vi      1
        _define_Vi      2
        _define_Vi      3
        _define_Vi      4
        _define_Vi      5
        _define_Vi      6
        _define_Vi      7
        _define_Vi      8
        _define_Vi      9
        _define_Vi      10
        _define_Vi      11
        _define_Vi      12
        _define_Vi      13
        _define_Vi      14
        _define_Vi      15
.if USE_AVX10
        _define_Vi      16
        _define_Vi      17
        _define_Vi      18
        _define_Vi      19
        _define_Vi      20
        _define_Vi      21
        _define_Vi      22
        _define_Vi      23
        _define_Vi      24
        _define_Vi      25
        _define_Vi      26
        _define_Vi      27
        _define_Vi      28
        _define_Vi      29
        _define_Vi      30
        _define_Vi      31
.endif

        // V0-V3 hold the data blocks during the main loop, or temporary values
        // otherwise.  V4-V5 hold temporary values.

        // V6-V9 hold XTS tweaks.  Each 128-bit lane holds one tweak.
        .set    TWEAK0_XMM,     %xmm6
        .set    TWEAK0,         V6
        .set    TWEAK1_XMM,     %xmm7
        .set    TWEAK1,         V7
        .set    TWEAK2,         V8
        .set    TWEAK3,         V9

        // V10-V13 are used for computing the next values of TWEAK[0-3].
        .set    NEXT_TWEAK0,    V10
        .set    NEXT_TWEAK1,    V11
        .set    NEXT_TWEAK2,    V12
        .set    NEXT_TWEAK3,    V13

        // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
        .set    GF_POLY_XMM,    %xmm14
        .set    GF_POLY,        V14

        // V15 holds the key for AES "round 0", copied to all 128-bit lanes.
        .set    KEY0_XMM,       %xmm15
        .set    KEY0,           V15

        // If 32 SIMD registers are available, then V16-V29 hold the remaining
        // AES round keys, copied to all 128-bit lanes.
        //
        // AES-128, AES-192, and AES-256 use different numbers of round keys.
        // To allow handling all three variants efficiently, we align the round
        // keys to the *end* of this register range.  I.e., AES-128 uses
        // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
        // (All also use KEY0 for the XOR-only "round" at the beginning.)
.if USE_AVX10
        .set    KEY1_XMM,       %xmm16
        .set    KEY1,           V16
        .set    KEY2_XMM,       %xmm17
        .set    KEY2,           V17
        .set    KEY3_XMM,       %xmm18
        .set    KEY3,           V18
        .set    KEY4_XMM,       %xmm19
        .set    KEY4,           V19
        .set    KEY5_XMM,       %xmm20
        .set    KEY5,           V20
        .set    KEY6_XMM,       %xmm21
        .set    KEY6,           V21
        .set    KEY7_XMM,       %xmm22
        .set    KEY7,           V22
        .set    KEY8_XMM,       %xmm23
        .set    KEY8,           V23
        .set    KEY9_XMM,       %xmm24
        .set    KEY9,           V24
        .set    KEY10_XMM,      %xmm25
        .set    KEY10,          V25
        .set    KEY11_XMM,      %xmm26
        .set    KEY11,          V26
        .set    KEY12_XMM,      %xmm27
        .set    KEY12,          V27
        .set    KEY13_XMM,      %xmm28
        .set    KEY13,          V28
        .set    KEY14_XMM,      %xmm29
        .set    KEY14,          V29
.endif
        // V30-V31 are currently unused.
.endm

// Move a vector between memory and a register.
.macro  _vmovdqu        src, dst
.if VL < 64
        vmovdqu         \src, \dst
.else
        vmovdqu8        \src, \dst
.endif
.endm

// Broadcast a 128-bit value into a vector.
.macro  _vbroadcast128  src, dst
.if VL == 16 && !USE_AVX10
        vmovdqu         \src, \dst
.elseif VL == 32 && !USE_AVX10
        vbroadcasti128  \src, \dst
.else
        vbroadcasti32x4 \src, \dst
.endif
.endm

// XOR two vectors together.
.macro  _vpxor  src1, src2, dst
.if USE_AVX10
        vpxord          \src1, \src2, \dst
.else
        vpxor           \src1, \src2, \dst
.endif
.endm

// XOR three vectors together.
.macro  _xor3   src1, src2, src3_and_dst
.if USE_AVX10
        // vpternlogd with immediate 0x96 is a three-argument XOR.
        vpternlogd      $0x96, \src1, \src2, \src3_and_dst
.else
        vpxor           \src1, \src3_and_dst, \src3_and_dst
        vpxor           \src2, \src3_and_dst, \src3_and_dst
.endif
.endm
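// Why immediate 0x96 is a three-way XOR: the vpternlogd immediate is an
// 8-entry truth table indexed by the three input bits.  XOR of three bits is 1
// exactly for the odd-parity indices 1, 2, 4, and 7, giving the bit pattern
// 0b10010110 = 0x96.  A quick illustrative C check (not part of the build):
//
//	for (int i = 0; i < 8; i++)
//		assert(((0x96 >> i) & 1) == (((i >> 2) ^ (i >> 1) ^ i) & 1));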
// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
// (by multiplying by the polynomial 'x') and write it to \dst.
.macro  _next_tweak     src, tmp, dst
        vpshufd         $0x13, \src, \tmp
        vpaddq          \src, \src, \dst
        vpsrad          $31, \tmp, \tmp
        vpand           GF_POLY_XMM, \tmp, \tmp
        vpxor           \tmp, \dst, \dst
.endm

// Given the XTS tweak(s) in the vector \src, compute the next vector of
// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to
// \dst.
//
// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
// all tweaks in the vector in parallel.  If VL=16, we just do the regular
// computation without vpclmulqdq, as it's the faster method for a single
// tweak.
.macro  _next_tweakvec  src, tmp1, tmp2, dst
.if VL == 16
        _next_tweak     \src, \tmp1, \dst
.else
        vpsrlq          $64 - VL/16, \src, \tmp1
        vpclmulqdq      $0x01, GF_POLY, \tmp1, \tmp2
        vpslldq         $8, \tmp1, \tmp1
        vpsllq          $VL/16, \src, \dst
        _xor3           \tmp1, \tmp2, \dst
.endif
.endm
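// For reference, the vpclmulqdq path above computes, independently in each
// 128-bit lane and with N = VL/16:
//
//	next = (tweak << N) ^ clmul(tweak >> (128 - N), 0x87)
//
// where clmul() denotes a carryless (GF(2)) multiplication.  This equals
// 'tweak * x^N mod (x^128 + x^7 + x^2 + x + 1)': the N bits shifted out of the
// top of the lane are reduced back in by multiplying them by 0x87.  The
// vpsrlq/vpslldq pair handles the carry of bits from the low quadword into the
// high quadword, and the vpclmulqdq produces the reduction term for the bits
// shifted out of the high quadword.  (Explanatory note only; the code above is
// authoritative.)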
// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
// store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
.macro  _compute_first_set_of_tweaks
        vmovdqu         (TWEAK), TWEAK0_XMM
        _vbroadcast128  .Lgf_poly(%rip), GF_POLY
.if VL == 16
        // With VL=16, multiplying by x serially is fastest.
        _next_tweak     TWEAK0, %xmm0, TWEAK1
        _next_tweak     TWEAK1, %xmm0, TWEAK2
        _next_tweak     TWEAK2, %xmm0, TWEAK3
.else
.if VL == 32
        // Compute the second block of TWEAK0.
        _next_tweak     TWEAK0_XMM, %xmm0, %xmm1
        vinserti128     $1, %xmm1, TWEAK0, TWEAK0
.elseif VL == 64
        // Compute the remaining blocks of TWEAK0.
        _next_tweak     TWEAK0_XMM, %xmm0, %xmm1
        _next_tweak     %xmm1, %xmm0, %xmm2
        _next_tweak     %xmm2, %xmm0, %xmm3
        vinserti32x4    $1, %xmm1, TWEAK0, TWEAK0
        vinserti32x4    $2, %xmm2, TWEAK0, TWEAK0
        vinserti32x4    $3, %xmm3, TWEAK0, TWEAK0
.endif
        // Compute TWEAK[1-3] from TWEAK0.
        vpsrlq          $64 - 1*VL/16, TWEAK0, V0
        vpsrlq          $64 - 2*VL/16, TWEAK0, V2
        vpsrlq          $64 - 3*VL/16, TWEAK0, V4
        vpclmulqdq      $0x01, GF_POLY, V0, V1
        vpclmulqdq      $0x01, GF_POLY, V2, V3
        vpclmulqdq      $0x01, GF_POLY, V4, V5
        vpslldq         $8, V0, V0
        vpslldq         $8, V2, V2
        vpslldq         $8, V4, V4
        vpsllq          $1*VL/16, TWEAK0, TWEAK1
        vpsllq          $2*VL/16, TWEAK0, TWEAK2
        vpsllq          $3*VL/16, TWEAK0, TWEAK3
.if USE_AVX10
        vpternlogd      $0x96, V0, V1, TWEAK1
        vpternlogd      $0x96, V2, V3, TWEAK2
        vpternlogd      $0x96, V4, V5, TWEAK3
.else
        vpxor           V0, TWEAK1, TWEAK1
        vpxor           V2, TWEAK2, TWEAK2
        vpxor           V4, TWEAK3, TWEAK3
        vpxor           V1, TWEAK1, TWEAK1
        vpxor           V3, TWEAK2, TWEAK2
        vpxor           V5, TWEAK3, TWEAK3
.endif
.endif
.endm

// Do one step in computing the next set of tweaks using the method of
// multiplying by x repeatedly (the same method _next_tweak uses).
.macro  _tweak_step_mulx        i
.if \i == 0
        .set PREV_TWEAK, TWEAK3
        .set NEXT_TWEAK, NEXT_TWEAK0
.elseif \i == 5
        .set PREV_TWEAK, NEXT_TWEAK0
        .set NEXT_TWEAK, NEXT_TWEAK1
.elseif \i == 10
        .set PREV_TWEAK, NEXT_TWEAK1
        .set NEXT_TWEAK, NEXT_TWEAK2
.elseif \i == 15
        .set PREV_TWEAK, NEXT_TWEAK2
        .set NEXT_TWEAK, NEXT_TWEAK3
.endif
.if \i >= 0 && \i < 20 && \i % 5 == 0
        vpshufd         $0x13, PREV_TWEAK, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 1
        vpaddq          PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
.elseif \i >= 0 && \i < 20 && \i % 5 == 2
        vpsrad          $31, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 3
        vpand           GF_POLY, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 4
        vpxor           V5, NEXT_TWEAK, NEXT_TWEAK
.elseif \i == 1000
        vmovdqa         NEXT_TWEAK0, TWEAK0
        vmovdqa         NEXT_TWEAK1, TWEAK1
        vmovdqa         NEXT_TWEAK2, TWEAK2
        vmovdqa         NEXT_TWEAK3, TWEAK3
.endif
.endm

// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
// (the same method _next_tweakvec uses for VL > 16).  This means multiplying
// each tweak by x^(4*VL/16) independently.  Since 4*VL/16 is a multiple of 8
// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
// which allows the use of vpsrldq and vpslldq instead of vpsrlq and vpsllq.
.macro  _tweak_step_pclmul      i
.if \i == 0
        vpsrldq         $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
.elseif \i == 2
        vpsrldq         $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
.elseif \i == 4
        vpsrldq         $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
.elseif \i == 6
        vpsrldq         $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
.elseif \i == 8
        vpclmulqdq      $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
.elseif \i == 10
        vpclmulqdq      $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
.elseif \i == 12
        vpclmulqdq      $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
.elseif \i == 14
        vpclmulqdq      $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
.elseif \i == 1000
        vpslldq         $(4*VL/16) / 8, TWEAK0, TWEAK0
        vpslldq         $(4*VL/16) / 8, TWEAK1, TWEAK1
        vpslldq         $(4*VL/16) / 8, TWEAK2, TWEAK2
        vpslldq         $(4*VL/16) / 8, TWEAK3, TWEAK3
        _vpxor          NEXT_TWEAK0, TWEAK0, TWEAK0
        _vpxor          NEXT_TWEAK1, TWEAK1, TWEAK1
        _vpxor          NEXT_TWEAK2, TWEAK2, TWEAK2
        _vpxor          NEXT_TWEAK3, TWEAK3, TWEAK3
.endif
.endm

// _tweak_step does one step of the computation of the next set of tweaks from
// TWEAK[0-3].  To complete all steps, this is invoked with increasing values
// of \i that include at least 0 through 19, then 1000 which signals the last
// step.
//
// This is used to interleave the computation of the next set of tweaks with
// the AES en/decryptions, which increases performance in some cases.
.macro  _tweak_step     i
.if VL == 16
        _tweak_step_mulx        \i
.else
        _tweak_step_pclmul      \i
.endif
.endm
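// A note on the step numbering: _vaes_4x (defined below) issues
// _tweak_step (2*(\i-5)) and _tweak_step (2*(\i-5) + 1) for AES round \i.
// Rounds 5 through 14 are executed for every key length and map exactly to
// steps 0 through 19.  Rounds 1-4 (executed only for AES-192/AES-256) produce
// negative step numbers, which match none of the cases above and so emit
// nothing.  Step 1000 is issued separately by the main loop after the
// destination blocks have been stored.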
.macro  _setup_round_keys       enc

        // Select either the encryption round keys or the decryption round
        // keys.
.if \enc
        .set    OFFS, 0
.else
        .set    OFFS, 240
.endif

        // Load the round key for "round 0".
        _vbroadcast128  OFFS(KEY), KEY0

        // Increment KEY to make it so that 7*16(KEY) is the last round key.
        // For AES-128, increment by 3*16, resulting in the 10 round keys (not
        // counting the zero-th round key which was just loaded into KEY0)
        // being -2*16(KEY) through 7*16(KEY).  For AES-192, increment by 5*16
        // and use 12 round keys -4*16(KEY) through 7*16(KEY).  For AES-256,
        // increment by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
        //
        // This rebasing provides two benefits.  First, it makes the offset to
        // any round key be in the range [-96, 112], which fits in a signed
        // byte.  This shortens VEX-encoded instructions that access the later
        // round keys which otherwise would need 4-byte offsets.  Second, it
        // makes it easy to do AES-128 and AES-192 by skipping irrelevant
        // rounds at the beginning.  Skipping rounds at the end doesn't work as
        // well because the last round needs different instructions.
        //
        // An alternative approach would be to roll up all the AES rounds.  We
        // don't do that because it isn't compatible with caching the round
        // keys in registers, which we do when possible (see below), and also
        // because it seems unwise to rely *too* heavily on the CPU's branch
        // predictor.
        lea             OFFS-16(KEY, KEYLEN64, 4), KEY

        // If all 32 SIMD registers are available, cache all the round keys.
.if USE_AVX10
        cmp             $24, KEYLEN
        jl              .Laes128\@
        je              .Laes192\@
        _vbroadcast128  -6*16(KEY), KEY1
        _vbroadcast128  -5*16(KEY), KEY2
.Laes192\@:
        _vbroadcast128  -4*16(KEY), KEY3
        _vbroadcast128  -3*16(KEY), KEY4
.Laes128\@:
        _vbroadcast128  -2*16(KEY), KEY5
        _vbroadcast128  -1*16(KEY), KEY6
        _vbroadcast128  0*16(KEY), KEY7
        _vbroadcast128  1*16(KEY), KEY8
        _vbroadcast128  2*16(KEY), KEY9
        _vbroadcast128  3*16(KEY), KEY10
        _vbroadcast128  4*16(KEY), KEY11
        _vbroadcast128  5*16(KEY), KEY12
        _vbroadcast128  6*16(KEY), KEY13
        _vbroadcast128  7*16(KEY), KEY14
.endif
.endm
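// For reference, the arithmetic behind the 'lea' above (encryption case,
// OFFS=0): KEY advances by 4*KEYLEN - 16 bytes, i.e. 4*16-16 = 48 = 3*16 for
// AES-128, 4*24-16 = 80 = 5*16 for AES-192, and 4*32-16 = 112 = 7*16 for
// AES-256.  Since the last round key of the expanded schedule lives at byte
// offset 16*rounds from the start of the schedule (160, 192, or 224
// respectively), it always ends up at 7*16(KEY) after the rebase, which is
// what lets all three key sizes share the round-key offsets used below.  The
// decryption case is the same arithmetic applied to the schedule at OFFS=240.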
// Do a single round of AES encryption (if \enc) or decryption (if !\enc) on
// the block(s) in \data using the round key \key.  The register width
// determines the number of AES blocks en/decrypted.
.macro  _vaes   enc, last, key, data
.if \enc
.if \last
        vaesenclast     \key, \data, \data
.else
        vaesenc         \key, \data, \data
.endif
.else
.if \last
        vaesdeclast     \key, \data, \data
.else
        vaesdec         \key, \data, \data
.endif
.endif
.endm

// Do a single round of AES en/decryption on the block(s) in \data, using the
// same key for all block(s).  The round key is loaded from the appropriate
// register or memory location for round \i.  May clobber V4.
.macro _vaes_1x         enc, last, i, xmm_suffix, data
.if USE_AVX10
        _vaes           \enc, \last, KEY\i\xmm_suffix, \data
.else
.ifnb \xmm_suffix
        _vaes           \enc, \last, (\i-7)*16(KEY), \data
.else
        _vbroadcast128  (\i-7)*16(KEY), V4
        _vaes           \enc, \last, V4, \data
.endif
.endif
.endm

// Do a single round of AES en/decryption on the blocks in registers V0-V3,
// using the same key for all blocks.  The round key is loaded from the
// appropriate register or memory location for round \i.  In addition, does two
// steps of the computation of the next set of tweaks.  May clobber V4.
.macro  _vaes_4x        enc, last, i
.if USE_AVX10
        _tweak_step     (2*(\i-5))
        _vaes           \enc, \last, KEY\i, V0
        _vaes           \enc, \last, KEY\i, V1
        _tweak_step     (2*(\i-5) + 1)
        _vaes           \enc, \last, KEY\i, V2
        _vaes           \enc, \last, KEY\i, V3
.else
        _vbroadcast128  (\i-7)*16(KEY), V4
        _tweak_step     (2*(\i-5))
        _vaes           \enc, \last, V4, V0
        _vaes           \enc, \last, V4, V1
        _tweak_step     (2*(\i-5) + 1)
        _vaes           \enc, \last, V4, V2
        _vaes           \enc, \last, V4, V3
.endif
.endm

// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
// then XOR with \tweak again) of the block(s) in \data.  To process a single
// block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
// length VL, use V* registers and leave \xmm_suffix empty.  May clobber V4.
.macro  _aes_crypt      enc, xmm_suffix, tweak, data
        _xor3           KEY0\xmm_suffix, \tweak, \data
        cmp             $24, KEYLEN
        jl              .Laes128\@
        je              .Laes192\@
        _vaes_1x        \enc, 0, 1, \xmm_suffix, \data
        _vaes_1x        \enc, 0, 2, \xmm_suffix, \data
.Laes192\@:
        _vaes_1x        \enc, 0, 3, \xmm_suffix, \data
        _vaes_1x        \enc, 0, 4, \xmm_suffix, \data
.Laes128\@:
        _vaes_1x        \enc, 0, 5, \xmm_suffix, \data
        _vaes_1x        \enc, 0, 6, \xmm_suffix, \data
        _vaes_1x        \enc, 0, 7, \xmm_suffix, \data
        _vaes_1x        \enc, 0, 8, \xmm_suffix, \data
        _vaes_1x        \enc, 0, 9, \xmm_suffix, \data
        _vaes_1x        \enc, 0, 10, \xmm_suffix, \data
        _vaes_1x        \enc, 0, 11, \xmm_suffix, \data
        _vaes_1x        \enc, 0, 12, \xmm_suffix, \data
        _vaes_1x        \enc, 0, 13, \xmm_suffix, \data
        _vaes_1x        \enc, 1, 14, \xmm_suffix, \data
        _vpxor          \tweak, \data, \data
.endm
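// For reference, per 16-byte block the _aes_crypt macro above implements the
// standard XTS "xor-encrypt-xor" construction.  A rough C sketch (illustrative
// only; xor_block() and aes_crypt_block() are hypothetical helpers standing in
// for the SIMD code):
//
//	xor_block(block, tweak);           /* block ^= tweak            */
//	aes_crypt_block(key, enc, block);  /* all AES rounds            */
//	xor_block(block, tweak);           /* block ^= tweak again      */
//
// The cmp/jl/je at the top of the macro simply skips the rounds that AES-128
// and AES-192 don't have, matching the round-key layout set up by
// _setup_round_keys.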
.macro  _aes_xts_crypt  enc
        _define_aliases

.if !\enc
        // When decrypting a message whose length isn't a multiple of the AES
        // block length, exclude the last full block from the main loop by
        // subtracting 16 from LEN.  This is needed because ciphertext stealing
        // decryption uses the last two tweaks in reverse order.  We'll handle
        // the last full block and the partial block specially at the end.
        lea             -16(LEN), %eax
        test            $15, LEN8
        cmovnz          %eax, LEN
.endif

        // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
        movl            480(KEY), KEYLEN

        // Setup the pointer to the round keys and cache as many as possible.
        _setup_round_keys       \enc

        // Compute the first set of tweaks TWEAK[0-3].
        _compute_first_set_of_tweaks

        sub             $4*VL, LEN
        jl              .Lhandle_remainder\@

.Lmain_loop\@:
        // This is the main loop, en/decrypting 4*VL bytes per iteration.

        // XOR each source block with its tweak and the zero-th round key.
.if USE_AVX10
        vmovdqu8        0*VL(SRC), V0
        vmovdqu8        1*VL(SRC), V1
        vmovdqu8        2*VL(SRC), V2
        vmovdqu8        3*VL(SRC), V3
        vpternlogd      $0x96, TWEAK0, KEY0, V0
        vpternlogd      $0x96, TWEAK1, KEY0, V1
        vpternlogd      $0x96, TWEAK2, KEY0, V2
        vpternlogd      $0x96, TWEAK3, KEY0, V3
.else
        vpxor           0*VL(SRC), KEY0, V0
        vpxor           1*VL(SRC), KEY0, V1
        vpxor           2*VL(SRC), KEY0, V2
        vpxor           3*VL(SRC), KEY0, V3
        vpxor           TWEAK0, V0, V0
        vpxor           TWEAK1, V1, V1
        vpxor           TWEAK2, V2, V2
        vpxor           TWEAK3, V3, V3
.endif
        cmp             $24, KEYLEN
        jl              .Laes128\@
        je              .Laes192\@
        // Do all the AES rounds on the data blocks, interleaved with the
        // computation of the next set of tweaks.
        _vaes_4x        \enc, 0, 1
        _vaes_4x        \enc, 0, 2
.Laes192\@:
        _vaes_4x        \enc, 0, 3
        _vaes_4x        \enc, 0, 4
.Laes128\@:
        _vaes_4x        \enc, 0, 5
        _vaes_4x        \enc, 0, 6
        _vaes_4x        \enc, 0, 7
        _vaes_4x        \enc, 0, 8
        _vaes_4x        \enc, 0, 9
        _vaes_4x        \enc, 0, 10
        _vaes_4x        \enc, 0, 11
        _vaes_4x        \enc, 0, 12
        _vaes_4x        \enc, 0, 13
        _vaes_4x        \enc, 1, 14

        // XOR in the tweaks again.
        _vpxor          TWEAK0, V0, V0
        _vpxor          TWEAK1, V1, V1
        _vpxor          TWEAK2, V2, V2
        _vpxor          TWEAK3, V3, V3

        // Store the destination blocks.
        _vmovdqu        V0, 0*VL(DST)
        _vmovdqu        V1, 1*VL(DST)
        _vmovdqu        V2, 2*VL(DST)
        _vmovdqu        V3, 3*VL(DST)

        // Finish computing the next set of tweaks.
        _tweak_step     1000

        add             $4*VL, SRC
        add             $4*VL, DST
        sub             $4*VL, LEN
        jge             .Lmain_loop\@

        // Check for the uncommon case where the data length isn't a multiple
        // of 4*VL.  Handle it out-of-line in order to optimize for the common
        // case.  In the common case, just fall through to the ret.
        test            $4*VL-1, LEN8
        jnz             .Lhandle_remainder\@
.Ldone\@:
        // Store the next tweak back to *TWEAK to support continuation calls.
        vmovdqu         TWEAK0_XMM, (TWEAK)
.if VL > 16
        vzeroupper
.endif
        RET

.Lhandle_remainder\@:

        // En/decrypt any remaining full blocks, one vector at a time.
.if VL > 16
        add             $3*VL, LEN      // Undo extra sub of 4*VL, then sub VL.
        jl              .Lvec_at_a_time_done\@
.Lvec_at_a_time\@:
        _vmovdqu        (SRC), V0
        _aes_crypt      \enc, , TWEAK0, V0
        _vmovdqu        V0, (DST)
        _next_tweakvec  TWEAK0, V0, V1, TWEAK0
        add             $VL, SRC
        add             $VL, DST
        sub             $VL, LEN
        jge             .Lvec_at_a_time\@
.Lvec_at_a_time_done\@:
        add             $VL-16, LEN     // Undo extra sub of VL, then sub 16.
.else
        add             $4*VL-16, LEN   // Undo extra sub of 4*VL, then sub 16.
.endif

        // En/decrypt any remaining full blocks, one at a time.
        jl              .Lblock_at_a_time_done\@
.Lblock_at_a_time\@:
        vmovdqu         (SRC), %xmm0
        _aes_crypt      \enc, _XMM, TWEAK0_XMM, %xmm0
        vmovdqu         %xmm0, (DST)
        _next_tweak     TWEAK0_XMM, %xmm0, TWEAK0_XMM
        add             $16, SRC
        add             $16, DST
        sub             $16, LEN
        jge             .Lblock_at_a_time\@
.Lblock_at_a_time_done\@:
        add             $16, LEN        // Undo the extra sub of 16.
        // Now 0 <= LEN <= 15.  If LEN is zero, we're done.
        jz              .Ldone\@
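// What follows is the ciphertext stealing (CTS) tail.  For reference, the
// encryption case computes, with CC denoting the en/decryption of the last
// full block already produced by the main loop (a sketch, not normative):
//
//	dst_partial[0..LEN-1] = CC[0..LEN-1]
//	last_full_input       = src_partial[0..LEN-1] || CC[LEN..15]
//	dst_last_full         = E(last_full_input ^ TWEAK0) ^ TWEAK0
//
// i.e., the first LEN bytes of CC are "stolen" to become the final partial
// ciphertext, and the partial plaintext padded with the remainder of CC is
// encrypted as the last full block.  Decryption is the mirror image, which is
// why it needs the last two tweaks in reverse order.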
        // Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
        // Do ciphertext stealing to process the last 16 + LEN bytes.

.if \enc
        // If encrypting, the main loop already encrypted the last full block
        // to create the CTS intermediate ciphertext.  Prepare for the rest of
        // CTS by rewinding the pointers and loading the intermediate
        // ciphertext.
        sub             $16, SRC
        sub             $16, DST
        vmovdqu         (DST), %xmm0
.else
        // If decrypting, the main loop didn't decrypt the last full block
        // because CTS decryption uses the last two tweaks in reverse order.
        // Do it now by advancing the tweak and decrypting the last full block.
        _next_tweak     TWEAK0_XMM, %xmm0, TWEAK1_XMM
        vmovdqu         (SRC), %xmm0
        _aes_crypt      \enc, _XMM, TWEAK1_XMM, %xmm0
.endif

.if USE_AVX10
        // Create a mask that has the first LEN bits set.
        mov             $-1, %r9d
        bzhi            LEN, %r9d, %r9d
        kmovd           %r9d, %k1

        // Swap the first LEN bytes of the en/decryption of the last full block
        // with the partial block.  Note that to support in-place
        // en/decryption, the load from the src partial block must happen
        // before the store to the dst partial block.
        vmovdqa         %xmm0, %xmm1
        vmovdqu8        16(SRC), %xmm0{%k1}
        vmovdqu8        %xmm1, 16(DST){%k1}
.else
        lea             .Lcts_permute_table(%rip), %r9

        // Load the src partial block, left-aligned.  Note that to support
        // in-place en/decryption, this must happen before the store to the dst
        // partial block.
        vmovdqu         (SRC, LEN64, 1), %xmm1

        // Shift the first LEN bytes of the en/decryption of the last full
        // block to the end of a register, then store it to DST+LEN.  This
        // stores the dst partial block.  It also writes to the second part of
        // the dst last full block, but that part is overwritten later.
        vpshufb         (%r9, LEN64, 1), %xmm0, %xmm2
        vmovdqu         %xmm2, (DST, LEN64, 1)

        // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
        sub             LEN64, %r9
        vmovdqu         32(%r9), %xmm3

        // Shift the src partial block to the beginning of its register.
        vpshufb         %xmm3, %xmm1, %xmm1

        // Do a blend to generate the src partial block followed by the second
        // part of the en/decryption of the last full block.
        vpblendvb       %xmm3, %xmm0, %xmm1, %xmm0
.endif
        // En/decrypt again and store the last full block.
        _aes_crypt      \enc, _XMM, TWEAK0_XMM, %xmm0
        vmovdqu         %xmm0, (DST)
        jmp             .Ldone\@
.endm

// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
//			   u8 iv[AES_BLOCK_SIZE]);
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
        vmovdqu         (%rsi), %xmm0
        vpxor           (%rdi), %xmm0, %xmm0
        movl            480(%rdi), %eax
        lea             -16(%rdi, %rax, 4), %rdi
        cmp             $24, %eax
        jl              .Lencrypt_iv_aes128
        je              .Lencrypt_iv_aes192
        vaesenc         -6*16(%rdi), %xmm0, %xmm0
        vaesenc         -5*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes192:
        vaesenc         -4*16(%rdi), %xmm0, %xmm0
        vaesenc         -3*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes128:
        vaesenc         -2*16(%rdi), %xmm0, %xmm0
        vaesenc         -1*16(%rdi), %xmm0, %xmm0
        vaesenc         0*16(%rdi), %xmm0, %xmm0
        vaesenc         1*16(%rdi), %xmm0, %xmm0
        vaesenc         2*16(%rdi), %xmm0, %xmm0
        vaesenc         3*16(%rdi), %xmm0, %xmm0
        vaesenc         4*16(%rdi), %xmm0, %xmm0
        vaesenc         5*16(%rdi), %xmm0, %xmm0
        vaesenc         6*16(%rdi), %xmm0, %xmm0
        vaesenclast     7*16(%rdi), %xmm0, %xmm0
        vmovdqu         %xmm0, (%rsi)
        RET
SYM_FUNC_END(aes_xts_encrypt_iv)

// Below are the actual AES-XTS encryption and decryption functions,
// instantiated from the above macro.  They all have the following prototype:
//
// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
//			const u8 *src, u8 *dst, unsigned int len,
//			u8 tweak[AES_BLOCK_SIZE]);
//
// |key| is the data key.  |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done.  This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE),
// and |len| must be a multiple of 16 except on the last call.  If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next
// tweak.

.set    VL, 16
.set    USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
        _aes_xts_crypt  1
SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
        _aes_xts_crypt  0
SYM_FUNC_END(aes_xts_decrypt_aesni_avx)

#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set    VL, 32
.set    USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
        _aes_xts_crypt  1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
        _aes_xts_crypt  0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)

.set    VL, 32
.set    USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
        _aes_xts_crypt  1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
        _aes_xts_crypt  0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)

.set    VL, 64
.set    USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
        _aes_xts_crypt  1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
        _aes_xts_crypt  0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
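// A minimal usage sketch of the functions above (illustrative only; the real
// glue code in arch/x86/crypto/aesni-intel_glue.c additionally handles CPU
// feature selection, splitting work across kernel_fpu_begin()/kernel_fpu_end()
// sections, and scatterlist walking):
//
//	u8 tweak[AES_BLOCK_SIZE];
//
//	memcpy(tweak, iv, AES_BLOCK_SIZE);
//	kernel_fpu_begin();
//	aes_xts_encrypt_iv(tweak_key, tweak);	/* encrypt the IV with the tweak key */
//	aes_xts_encrypt_aesni_avx(data_key, src, dst, len, tweak);
//	kernel_fpu_end();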