/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 32-way parallel algorithm (AVX2)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

/* register macros */
#define CTX %rdi

#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)	\
	( (((a0) & 1) << 0) |			\
	  (((a1) & 1) << 1) |			\
	  (((a2) & 1) << 2) |			\
	  (((a3) & 1) << 3) |			\
	  (((a4) & 1) << 4) |			\
	  (((a5) & 1) << 5) |			\
	  (((a6) & 1) << 6) |			\
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)	\
	( ((l7) << (0 * 8)) |			\
	  ((l6) << (1 * 8)) |			\
	  ((l5) << (2 * 8)) |			\
	  ((l4) << (3 * 8)) |			\
	  ((l3) << (4 * 8)) |			\
	  ((l2) << (5 * 8)) |			\
	  ((l1) << (6 * 8)) |			\
	  ((l0) << (7 * 8)) )

#define inc_le128(x, minus_one, tmp)	\
	vpcmpeqq minus_one, x, tmp;	\
	vpsubq minus_one, x, x;		\
	vpslldq $8, tmp, tmp;		\
	vpsubq tmp, x, x;

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;			\
	vpandn x, mask4bit, x;				\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;

#define transpose_4x4(x0, x1, x2, x3, t1, t2)	\
	vpunpckhdq x1, x0, t2;			\
	vpunpckldq x1, x0, x0;			\
						\
	vpunpckldq x3, x2, t1;			\
	vpunpckhdq x3, x2, x2;			\
						\
	vpunpckhqdq t1, x0, x1;			\
	vpunpcklqdq t1, x0, x0;			\
						\
	vpunpckhqdq x2, t2, x3;			\
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0,		\
			 a1, b1, c1, d1,		\
			 a2, b2, c2, d2,		\
			 a3, b3, c3, d3,		\
			 st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */
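
/*
 * Byte-slicing, pictured in plain C: ignoring the exact ordering of blocks
 * inside a register and the split into the ab/cd halves, byteslice_16x16b
 * turns 32 consecutive 16-byte blocks into 16 vectors that each hold one
 * byte position of every block.  A minimal sketch with a hypothetical
 * helper (not used by this file):
 *
 *	#include <stdint.h>
 *
 *	static void byteslice_32blocks(const uint8_t in[32][16],
 *				       uint8_t sliced[16][32])
 *	{
 *		int blk, pos;
 *
 *		for (blk = 0; blk < 32; blk++)
 *			for (pos = 0; pos < 16; pos++)
 *				sliced[pos][blk] = in[blk][pos];
 *	}
 *
 * This layout is what lets aria_ark_8way() below apply a single round-key
 * byte to a whole register with one vpbroadcastb + vpxor.
 */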

#define debyteslice_16x16b(a0, b0, c0, d0,		\
			   a1, b1, c1, d1,		\
			   a2, b2, c2, d2,		\
			   a3, b3, c3, d3,		\
			   st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vbroadcasti128 .Lshufb_16x16b(%rip), a0;	\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3,	\
		     x4, x5, x6, x7,	\
		     y0, y1, y2, y3,	\
		     y4, y5, y6, y7,	\
		     rio)		\
	vmovdqu (0 * 32)(rio), x0;	\
	vmovdqu (1 * 32)(rio), x1;	\
	vmovdqu (2 * 32)(rio), x2;	\
	vmovdqu (3 * 32)(rio), x3;	\
	vmovdqu (4 * 32)(rio), x4;	\
	vmovdqu (5 * 32)(rio), x5;	\
	vmovdqu (6 * 32)(rio), x6;	\
	vmovdqu (7 * 32)(rio), x7;	\
	vmovdqu (8 * 32)(rio), y0;	\
	vmovdqu (9 * 32)(rio), y1;	\
	vmovdqu (10 * 32)(rio), y2;	\
	vmovdqu (11 * 32)(rio), y3;	\
	vmovdqu (12 * 32)(rio), y4;	\
	vmovdqu (13 * 32)(rio), y5;	\
	vmovdqu (14 * 32)(rio), y6;	\
	vmovdqu (15 * 32)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3,		\
		      x4, x5, x6, x7,		\
		      y0, y1, y2, y3,		\
		      y4, y5, y6, y7,		\
		      mem_ab, mem_cd)		\
	byteslice_16x16b(x0, x1, x2, x3,	\
			 x4, x5, x6, x7,	\
			 y0, y1, y2, y3,	\
			 y4, y5, y6, y7,	\
			 (mem_ab), (mem_cd));	\
						\
	vmovdqu x0, 0 * 32(mem_ab);		\
	vmovdqu x1, 1 * 32(mem_ab);		\
	vmovdqu x2, 2 * 32(mem_ab);		\
	vmovdqu x3, 3 * 32(mem_ab);		\
	vmovdqu x4, 4 * 32(mem_ab);		\
	vmovdqu x5, 5 * 32(mem_ab);		\
	vmovdqu x6, 6 * 32(mem_ab);		\
	vmovdqu x7, 7 * 32(mem_ab);		\
	vmovdqu y0, 0 * 32(mem_cd);		\
	vmovdqu y1, 1 * 32(mem_cd);		\
	vmovdqu y2, 2 * 32(mem_cd);		\
	vmovdqu y3, 3 * 32(mem_cd);		\
	vmovdqu y4, 4 * 32(mem_cd);		\
	vmovdqu y5, 5 * 32(mem_cd);		\
	vmovdqu y6, 6 * 32(mem_cd);		\
	vmovdqu y7, 7 * 32(mem_cd);

#define write_output(x0, x1, x2, x3,	\
		     x4, x5, x6, x7,	\
		     y0, y1, y2, y3,	\
		     y4, y5, y6, y7,	\
		     mem)		\
	vmovdqu x0, 0 * 32(mem);	\
	vmovdqu x1, 1 * 32(mem);	\
	vmovdqu x2, 2 * 32(mem);	\
	vmovdqu x3, 3 * 32(mem);	\
	vmovdqu x4, 4 * 32(mem);	\
	vmovdqu x5, 5 * 32(mem);	\
	vmovdqu x6, 6 * 32(mem);	\
	vmovdqu x7, 7 * 32(mem);	\
	vmovdqu y0, 8 * 32(mem);	\
	vmovdqu y1, 9 * 32(mem);	\
	vmovdqu y2, 10 * 32(mem);	\
	vmovdqu y3, 11 * 32(mem);	\
	vmovdqu y4, 12 * 32(mem);	\
	vmovdqu y5, 13 * 32(mem);	\
	vmovdqu y6, 14 * 32(mem);	\
	vmovdqu y7, 15 * 32(mem);

#define aria_store_state_8way(x0, x1, x2, x3,	\
			      x4, x5, x6, x7,	\
			      mem_tmp, idx)	\
	vmovdqu x0, ((idx + 0) * 32)(mem_tmp);	\
	vmovdqu x1, ((idx + 1) * 32)(mem_tmp);	\
	vmovdqu x2, ((idx + 2) * 32)(mem_tmp);	\
	vmovdqu x3, ((idx + 3) * 32)(mem_tmp);	\
	vmovdqu x4, ((idx + 4) * 32)(mem_tmp);	\
	vmovdqu x5, ((idx + 5) * 32)(mem_tmp);	\
	vmovdqu x6, ((idx + 6) * 32)(mem_tmp);	\
	vmovdqu x7, ((idx + 7) * 32)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3,	\
			     x4, x5, x6, x7,	\
			     mem_tmp, idx)	\
	vmovdqu ((idx + 0) * 32)(mem_tmp), x0;	\
	vmovdqu ((idx + 1) * 32)(mem_tmp), x1;	\
	vmovdqu ((idx + 2) * 32)(mem_tmp), x2;	\
	vmovdqu ((idx + 3) * 32)(mem_tmp), x3;	\
	vmovdqu ((idx + 4) * 32)(mem_tmp), x4;	\
	vmovdqu ((idx + 5) * 32)(mem_tmp), x5;	\
	vmovdqu ((idx + 6) * 32)(mem_tmp), x6;	\
	vmovdqu ((idx + 7) * 32)(mem_tmp), x7;

#define aria_ark_8way(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      t0, rk, idx, round)		\
	/* AddRoundKey */				\
	vpbroadcastb ((round * 16) + idx + 3)(rk), t0;	\
	vpxor t0, x0, x0;				\
	vpbroadcastb ((round * 16) + idx + 2)(rk), t0;	\
	vpxor t0, x1, x1;				\
	vpbroadcastb ((round * 16) + idx + 1)(rk), t0;	\
	vpxor t0, x2, x2;				\
	vpbroadcastb ((round * 16) + idx + 0)(rk), t0;	\
	vpxor t0, x3, x3;				\
	vpbroadcastb ((round * 16) + idx + 7)(rk), t0;	\
	vpxor t0, x4, x4;				\
	vpbroadcastb ((round * 16) + idx + 6)(rk), t0;	\
	vpxor t0, x5, x5;				\
	vpbroadcastb ((round * 16) + idx + 5)(rk), t0;	\
	vpxor t0, x6, x6;				\
	vpbroadcastb ((round * 16) + idx + 4)(rk), t0;	\
	vpxor t0, x7, x7;

#ifdef CONFIG_AS_GFNI
#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    t0, t1, t2, t3,		\
			    t4, t5, t6, t7)		\
	vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0;	\
	vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1;	\
	vpbroadcastq .Ltf_id_bitmatrix(%rip), t2;	\
	vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3;	\
	vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7

#endif /* CONFIG_AS_GFNI */
#define aria_sbox_8way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       t0, t1, t2, t3,			\
		       t4, t5, t6, t7)			\
	vpxor t7, t7, t7;				\
	vpxor t6, t6, t6;				\
	vbroadcasti128 .Linv_shift_row(%rip), t0;	\
	vbroadcasti128 .Lshift_row(%rip), t1;		\
	vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2;	\
	vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3;	\
	vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4;	\
	vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5;	\
							\
	vextracti128 $1, x0, t6##_x;			\
	vaesenclast t7##_x, x0##_x, x0##_x;		\
	vaesenclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x0, x0;			\
							\
	vextracti128 $1, x4, t6##_x;			\
	vaesenclast t7##_x, x4##_x, x4##_x;		\
	vaesenclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x4, x4;			\
							\
	vextracti128 $1, x1, t6##_x;			\
	vaesenclast t7##_x, x1##_x, x1##_x;		\
	vaesenclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x1, x1;			\
							\
	vextracti128 $1, x5, t6##_x;			\
	vaesenclast t7##_x, x5##_x, x5##_x;		\
	vaesenclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x5, x5;			\
							\
	vextracti128 $1, x2, t6##_x;			\
	vaesdeclast t7##_x, x2##_x, x2##_x;		\
	vaesdeclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x2, x2;			\
							\
	vextracti128 $1, x6, t6##_x;			\
	vaesdeclast t7##_x, x6##_x, x6##_x;		\
	vaesdeclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x6, x6;			\
							\
	vpbroadcastd .L0f0f0f0f(%rip), t6;		\
							\
	/* AES inverse shift rows */			\
	vpshufb t0, x0, x0;				\
	vpshufb t0, x4, x4;				\
	vpshufb t0, x1, x1;				\
	vpshufb t0, x5, x5;				\
	vpshufb t1, x3, x3;				\
	vpshufb t1, x7, x7;				\
	vpshufb t1, x2, x2;				\
	vpshufb t1, x6, x6;				\
							\
	/* affine transformation for S2 */		\
	filter_8bit(x1, t2, t3, t6, t0);		\
	/* affine transformation for S2 */		\
	filter_8bit(x5, t2, t3, t6, t0);		\
							\
	/* affine transformation for X2 */		\
	filter_8bit(x3, t4, t5, t6, t0);		\
	/* affine transformation for X2 */		\
	filter_8bit(x7, t4, t5, t6, t0);		\
							\
	vpxor t6, t6, t6;				\
	vextracti128 $1, x3, t6##_x;			\
	vaesdeclast t7##_x, x3##_x, x3##_x;		\
	vaesdeclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x3, x3;			\
							\
	vextracti128 $1, x7, t6##_x;			\
	vaesdeclast t7##_x, x7##_x, x7##_x;		\
	vaesdeclast t7##_x, t6##_x, t6##_x;		\
	vinserti128 $1, t6##_x, x7, x7;

#define aria_diff_m(x0, x1, x2, x3,		\
		    t0, t1, t2, t3)		\
	/* T = rotr32(X, 8); */			\
	/* X ^= T */				\
	vpxor x0, x3, t0;			\
	vpxor x1, x0, t1;			\
	vpxor x2, x1, t2;			\
	vpxor x3, x2, t3;			\
	/* X = T ^ rotr(X, 16); */		\
	vpxor t2, x0, x0;			\
	vpxor x1, t3, t3;			\
	vpxor t0, x2, x2;			\
	vpxor t1, x3, x1;			\
	vmovdqu t3, x3;

#define aria_diff_word(x0, x1, x2, x3,	\
		       x4, x5, x6, x7,	\
		       y0, y1, y2, y3,	\
		       y4, y5, y6, y7)	\
	/* t1 ^= t2; */			\
	vpxor y0, x4, x4;		\
	vpxor y1, x5, x5;		\
	vpxor y2, x6, x6;		\
	vpxor y3, x7, x7;		\
					\
	/* t2 ^= t3; */			\
	vpxor y4, y0, y0;		\
	vpxor y5, y1, y1;		\
	vpxor y6, y2, y2;		\
	vpxor y7, y3, y3;		\
					\
	/* t0 ^= t1; */			\
	vpxor x4, x0, x0;		\
	vpxor x5, x1, x1;		\
	vpxor x6, x2, x2;		\
	vpxor x7, x3, x3;		\
					\
	/* t3 ^= t1; */			\
	vpxor x4, y4, y4;		\
	vpxor x5, y5, y5;		\
	vpxor x6, y6, y6;		\
	vpxor x7, y7, y7;		\
					\
	/* t2 ^= t0; */			\
	vpxor x0, y0, y0;		\
	vpxor x1, y1, y1;		\
	vpxor x2, y2, y2;		\
	vpxor x3, y3, y3;		\
					\
	/* t1 ^= t2; */			\
	vpxor y0, x4, x4;		\
	vpxor y1, x5, x5;		\
	vpxor y2, x6, x6;		\
	vpxor y3, x7, x7;
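
/*
 * Word-level view of aria_diff_m above, as a sketch of the comment's
 * T = rotr32(X, 8); X ^= T; X = T ^ rotr(X, 16) sequence (u32/ror32() as
 * in <linux/types.h> and <linux/bitops.h>):
 *
 *	static u32 aria_diff_m_word(u32 x)
 *	{
 *		u32 t = ror32(x, 8);
 *
 *		x ^= t;
 *		return t ^ ror32(x, 16);
 *	}
 *
 * Each output byte ends up as the XOR of three of the four input bytes;
 * the vpxor chain in aria_diff_m computes exactly that on the byte-sliced
 * registers x0..x3, and aria_diff_word then mixes the four 32-bit words.
 */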

#define aria_fe(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);		\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);		\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);
#ifdef CONFIG_AS_GFNI
#define aria_fe_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round, last_round)	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 8, last_round);		\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, round);		\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, rk, 0, last_round);		\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);
#endif /* CONFIG_AS_GFNI */

.section	.rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section	.rodata.cst16, "aM", @progbits, 16
.align 16
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
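
/*
 * The four tables above are consumed by filter_8bit(): an arbitrary 8-bit
 * mapping is evaluated as two 16-entry nibble lookups (vpshufb) whose
 * results are XORed.  A plain-C model, where lo[] and hi[] stand for one
 * .Ltf_lo__ / .Ltf_hi__ constant pair (hypothetical helper):
 *
 *	#include <stdint.h>
 *
 *	static uint8_t filter_8bit_model(uint8_t x, const uint8_t lo[16],
 *					 const uint8_t hi[16])
 *	{
 *		return lo[x & 0x0f] ^ hi[x >> 4];
 *	}
 *
 * The 4-bit mask .L0f0f0f0f below isolates the low nibbles; vpandn plus
 * vpsrld $4 produces the high-nibble indexes.
 */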

#ifdef CONFIG_AS_GFNI
.section	.rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

#endif /* CONFIG_AS_GFNI */
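
/*
 * How the bit-matrices above are consumed: vgf2p8affineqb computes, per
 * byte, y = A*x + c over GF(2), with the 8x8 matrix A packed by
 * BM8X8()/BV8() (row 0 in the most significant byte, coefficient a0 in
 * bit 0) and the constant c supplied as the immediate.  A plain-C model
 * of the per-byte affine step, following the documented bit ordering:
 *
 *	#include <stdint.h>
 *
 *	static uint8_t gf2p8_affine_byte(uint64_t matrix, uint8_t x, uint8_t c)
 *	{
 *		uint8_t y = 0;
 *		int i;
 *
 *		for (i = 0; i < 8; i++) {
 *			uint8_t row = matrix >> (56 - 8 * i);
 *
 *			y |= (uint8_t)__builtin_parity(row & x) << i;
 *		}
 *		return y ^ c;
 *	}
 *
 * vgf2p8affineinvqb applies the byte-wise GF(2^8) inverse before the same
 * affine step, which is how aria_sbox_8way_gfni() builds the AES S-box
 * (tf_aff), its inverse (tf_inv followed by the identity matrix), S2 and X2.
 */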

/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %ymm0..%ymm15: byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 0);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 1);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 2);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 3);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 4);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 5);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 6);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 7);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 8);
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 9);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_192;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 11);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_256;
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 13);
	aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
		%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		%rax, %r9, 14);
	aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		%ymm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)

SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	movq 8(%r8), %r11;
	bswapq %r11;

	vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
	vpcmpeqd %ymm0, %ymm0, %ymm0;
	vpsrldq $8, %ymm0, %ymm0;   /* ab: -1:0 ; cd: -1:0 */
	vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */

	/* load IV and byteswap */
	vmovdqu (%r8), %xmm7;
	vpshufb %xmm6, %xmm7, %xmm7;
	vmovdqa %xmm7, %xmm3;
	inc_le128(%xmm7, %xmm0, %xmm4);
	vinserti128 $1, %xmm7, %ymm3, %ymm3;
	vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */

	/* check need for handling 64-bit overflow and carry */
	cmpq $(0xffffffffffffffff - 32), %r11;
	ja .Lhandle_ctr_carry;

	/* construct IVs */
	vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
	vpshufb %ymm6, %ymm3, %ymm8;
	vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
	vpshufb %ymm6, %ymm3, %ymm9;
	vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
	vpshufb %ymm6, %ymm3, %ymm10;
	vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
	vpshufb %ymm6, %ymm3, %ymm11;
	vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
	vpshufb %ymm6, %ymm3, %ymm12;
	vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
	vpshufb %ymm6, %ymm3, %ymm13;
	vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
	vpshufb %ymm6, %ymm3, %ymm14;
	vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
	vpshufb %ymm6, %ymm3, %ymm15;
	vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
	vpshufb %xmm6, %xmm3, %xmm3;
	vmovdqu %xmm3, (%r8);
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;
	jmp .Lctr_carry_done;

.Lhandle_ctr_carry:
	/* construct IVs */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
	vmovdqu %ymm8, (0 * 32)(%rcx);
	vmovdqu %ymm9, (1 * 32)(%rcx);
	vmovdqu %ymm10, (2 * 32)(%rcx);
	vmovdqu %ymm11, (3 * 32)(%rcx);
	vmovdqu %ymm12, (4 * 32)(%rcx);
	vmovdqu %ymm13, (5 * 32)(%rcx);
	vmovdqu %ymm14, (6 * 32)(%rcx);
	vmovdqu %ymm15, (7 * 32)(%rcx);

	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	inc_le128(%ymm3, %ymm0, %ymm4);
	vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
	inc_le128(%ymm3, %ymm0, %ymm4);
	vextracti128 $1, %ymm3, %xmm3;
	vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
	vmovdqu %xmm3, (%r8);
	vmovdqu (0 * 32)(%rcx), %ymm0;
	vmovdqu (1 * 32)(%rcx), %ymm1;
	vmovdqu (2 * 32)(%rcx), %ymm2;
	vmovdqu (3 * 32)(%rcx), %ymm3;
	vmovdqu (4 * 32)(%rcx), %ymm4;
	vmovdqu (5 * 32)(%rcx), %ymm5;
	vmovdqu (6 * 32)(%rcx), %ymm6;
	vmovdqu (7 * 32)(%rcx), %ymm7;

.Lctr_carry_done:

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)
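
/*
 * The keystream generator treats the IV as a 128-bit big-endian counter:
 * the fast path adds the per-register increments with 64-bit lane
 * arithmetic (the cmpq above guards against the low quadword overflowing
 * while 32 is added), and the slow path propagates the carry per block
 * with inc_le128.  The byte-wise equivalent, as a sketch:
 *
 *	#include <stdint.h>
 *
 *	static void ctr128_inc_be(uint8_t ctr[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++ctr[i] != 0)	// stop once a byte no longer wraps
 *				break;
 *	}
 */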

SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_crypt_32way;

	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)

#ifdef CONFIG_AS_GFNI
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %ymm0..%ymm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 32(%rax), %r8;

	inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
		      %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11,
		      %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %r8);
	aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 1);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 3);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 5);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 7);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 9);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 11);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 13);
	aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
		     %ymm12, %ymm13, %ymm14, %ymm15,
		     %ymm0, %ymm1, %ymm2, %ymm3,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
		     %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11,
		     %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
			   %ymm9, %ymm13, %ymm0, %ymm5,
			   %ymm10, %ymm14, %ymm3, %ymm6,
			   %ymm11, %ymm15, %ymm2, %ymm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx);

	call __aria_aesni_avx2_gfni_crypt_32way;

	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx2_ctr_gen_keystream_32way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx2_gfni_crypt_32way;

	vpxor (0 * 32)(%r11), %ymm1, %ymm1;
	vpxor (1 * 32)(%r11), %ymm0, %ymm0;
	vpxor (2 * 32)(%r11), %ymm3, %ymm3;
	vpxor (3 * 32)(%r11), %ymm2, %ymm2;
	vpxor (4 * 32)(%r11), %ymm4, %ymm4;
	vpxor (5 * 32)(%r11), %ymm5, %ymm5;
	vpxor (6 * 32)(%r11), %ymm6, %ymm6;
	vpxor (7 * 32)(%r11), %ymm7, %ymm7;
	vpxor (8 * 32)(%r11), %ymm8, %ymm8;
	vpxor (9 * 32)(%r11), %ymm9, %ymm9;
	vpxor (10 * 32)(%r11), %ymm10, %ymm10;
	vpxor (11 * 32)(%r11), %ymm11, %ymm11;
	vpxor (12 * 32)(%r11), %ymm12, %ymm12;
	vpxor (13 * 32)(%r11), %ymm13, %ymm13;
	vpxor (14 * 32)(%r11), %ymm14, %ymm14;
	vpxor (15 * 32)(%r11), %ymm15, %ymm15;
	write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)
#endif /* CONFIG_AS_GFNI */
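
/*
 * Calling convention implied by the register comments above (%rdi = ctx,
 * %rsi = dst, %rdx = src, %rcx = keystream, %r8 = iv); the prototypes
 * below are a sketch of how the C glue is expected to declare these entry
 * points, not a copy of the actual header:
 *
 *	asmlinkage void aria_aesni_avx2_encrypt_32way(const void *ctx, u8 *dst,
 *						      const u8 *src);
 *	asmlinkage void aria_aesni_avx2_decrypt_32way(const void *ctx, u8 *dst,
 *						      const u8 *src);
 *	asmlinkage void aria_aesni_avx2_ctr_crypt_32way(const void *ctx, u8 *dst,
 *							const u8 *src,
 *							u8 *keystream, u8 *iv);
 *
 * Each call processes 32 blocks (32 * 16 bytes); the CTR entry point also
 * needs a 32 * 16 byte keystream buffer and updates the IV in place.
 */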