/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/asm-offsets.h>
#include <asm/frame.h>

/* register macros */
#define CTX %rdi


#define BV8(a0, a1, a2, a3, a4, a5, a6, a7)		\
	( (((a0) & 1) << 0) |				\
	  (((a1) & 1) << 1) |				\
	  (((a2) & 1) << 2) |				\
	  (((a3) & 1) << 3) |				\
	  (((a4) & 1) << 4) |				\
	  (((a5) & 1) << 5) |				\
	  (((a6) & 1) << 6) |				\
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7)		\
	( ((l7) << (0 * 8)) |				\
	  ((l6) << (1 * 8)) |				\
	  ((l5) << (2 * 8)) |				\
	  ((l4) << (3 * 8)) |				\
	  ((l3) << (4 * 8)) |				\
	  ((l2) << (5 * 8)) |				\
	  ((l1) << (6 * 8)) |				\
	  ((l0) << (7 * 8)) )

#define inc_le128(x, minus_one, tmp)			\
	vpcmpeqq minus_one, x, tmp;			\
	vpsubq minus_one, x, x;				\
	vpslldq $8, tmp, tmp;				\
	vpsubq tmp, x, x;

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0)	\
	vpand x, mask4bit, tmp0;			\
	vpandn x, mask4bit, x;				\
	vpsrld $4, x, x;				\
							\
	vpshufb tmp0, lo_t, tmp0;			\
	vpshufb x, hi_t, x;				\
	vpxor tmp0, x, x;

#define transpose_4x4(x0, x1, x2, x3, t1, t2)		\
	vpunpckhdq x1, x0, t2;				\
	vpunpckldq x1, x0, x0;				\
							\
	vpunpckldq x3, x2, t1;				\
	vpunpckhdq x3, x2, x2;				\
							\
	vpunpckhqdq t1, x0, x1;				\
	vpunpcklqdq t1, x0, x0;				\
							\
	vpunpckhqdq x2, t2, x3;				\
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0,		\
			 a1, b1, c1, d1,		\
			 a2, b2, c2, d2,		\
			 a3, b3, c3, d3,		\
			 st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vmovdqu .Lshufb_16x16b(%rip), a0;		\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(a0, b0, c0, d0, d2, d3);		\
	transpose_4x4(a1, b1, c1, d1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(a2, b2, c2, d2, b0, b1);		\
	transpose_4x4(a3, b3, c3, d3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0,		\
			   a1, b1, c1, d1,		\
			   a2, b2, c2, d2,		\
			   a3, b3, c3, d3,		\
			   st0, st1)			\
	vmovdqu d2, st0;				\
	vmovdqu d3, st1;				\
	transpose_4x4(a0, a1, a2, a3, d2, d3);		\
	transpose_4x4(b0, b1, b2, b3, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu a0, st0;				\
	vmovdqu a1, st1;				\
	transpose_4x4(c0, c1, c2, c3, a0, a1);		\
	transpose_4x4(d0, d1, d2, d3, a0, a1);		\
							\
	vmovdqu .Lshufb_16x16b(%rip), a0;		\
	vmovdqu st1, a1;				\
	vpshufb a0, a2, a2;				\
	vpshufb a0, a3, a3;				\
	vpshufb a0, b0, b0;				\
	vpshufb a0, b1, b1;				\
	vpshufb a0, b2, b2;				\
	vpshufb a0, b3, b3;				\
	vpshufb a0, a1, a1;				\
	vpshufb a0, c0, c0;				\
	vpshufb a0, c1, c1;				\
	vpshufb a0, c2, c2;				\
	vpshufb a0, c3, c3;				\
	vpshufb a0, d0, d0;				\
	vpshufb a0, d1, d1;				\
	vpshufb a0, d2, d2;				\
	vpshufb a0, d3, d3;				\
	vmovdqu d3, st1;				\
	vmovdqu st0, d3;				\
	vpshufb a0, d3, a0;				\
	vmovdqu d2, st0;				\
							\
	transpose_4x4(c0, d0, a0, b0, d2, d3);		\
	transpose_4x4(c1, d1, a1, b1, d2, d3);		\
	vmovdqu st0, d2;				\
	vmovdqu st1, d3;				\
							\
	vmovdqu b0, st0;				\
	vmovdqu b1, st1;				\
	transpose_4x4(c2, d2, a2, b2, b0, b1);		\
	transpose_4x4(c3, d3, a3, b3, b0, b1);		\
	vmovdqu st0, b0;				\
	vmovdqu st1, b1;				\
	/* does not adjust output bytes inside vectors */
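
/*
 * byteslice_16x16b()/debyteslice_16x16b() treat the 16 loaded blocks as a
 * 16x16 byte matrix (one block per register) and transpose it, so that
 * afterwards register N holds byte N of all 16 blocks.  Roughly, as an
 * illustrative C sketch (not part of the build):
 *
 *	for (i = 0; i < 16; i++)
 *		for (j = 0; j < 16; j++)
 *			sliced[i][j] = blocks[j][i];
 *
 * This layout lets the byte-wise S-box and diffusion steps below operate on
 * the same byte position of 16 blocks in parallel.
 */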

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     rio)				\
	vmovdqu (0 * 16)(rio), x0;			\
	vmovdqu (1 * 16)(rio), x1;			\
	vmovdqu (2 * 16)(rio), x2;			\
	vmovdqu (3 * 16)(rio), x3;			\
	vmovdqu (4 * 16)(rio), x4;			\
	vmovdqu (5 * 16)(rio), x5;			\
	vmovdqu (6 * 16)(rio), x6;			\
	vmovdqu (7 * 16)(rio), x7;			\
	vmovdqu (8 * 16)(rio), y0;			\
	vmovdqu (9 * 16)(rio), y1;			\
	vmovdqu (10 * 16)(rio), y2;			\
	vmovdqu (11 * 16)(rio), y3;			\
	vmovdqu (12 * 16)(rio), y4;			\
	vmovdqu (13 * 16)(rio), y5;			\
	vmovdqu (14 * 16)(rio), y6;			\
	vmovdqu (15 * 16)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      y0, y1, y2, y3,			\
		      y4, y5, y6, y7,			\
		      mem_ab, mem_cd)			\
	byteslice_16x16b(x0, x1, x2, x3,		\
			 x4, x5, x6, x7,		\
			 y0, y1, y2, y3,		\
			 y4, y5, y6, y7,		\
			 (mem_ab), (mem_cd));		\
							\
	vmovdqu x0, 0 * 16(mem_ab);			\
	vmovdqu x1, 1 * 16(mem_ab);			\
	vmovdqu x2, 2 * 16(mem_ab);			\
	vmovdqu x3, 3 * 16(mem_ab);			\
	vmovdqu x4, 4 * 16(mem_ab);			\
	vmovdqu x5, 5 * 16(mem_ab);			\
	vmovdqu x6, 6 * 16(mem_ab);			\
	vmovdqu x7, 7 * 16(mem_ab);			\
	vmovdqu y0, 0 * 16(mem_cd);			\
	vmovdqu y1, 1 * 16(mem_cd);			\
	vmovdqu y2, 2 * 16(mem_cd);			\
	vmovdqu y3, 3 * 16(mem_cd);			\
	vmovdqu y4, 4 * 16(mem_cd);			\
	vmovdqu y5, 5 * 16(mem_cd);			\
	vmovdqu y6, 6 * 16(mem_cd);			\
	vmovdqu y7, 7 * 16(mem_cd);

#define write_output(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem)				\
	vmovdqu x0, 0 * 16(mem);			\
	vmovdqu x1, 1 * 16(mem);			\
	vmovdqu x2, 2 * 16(mem);			\
	vmovdqu x3, 3 * 16(mem);			\
	vmovdqu x4, 4 * 16(mem);			\
	vmovdqu x5, 5 * 16(mem);			\
	vmovdqu x6, 6 * 16(mem);			\
	vmovdqu x7, 7 * 16(mem);			\
	vmovdqu y0, 8 * 16(mem);			\
	vmovdqu y1, 9 * 16(mem);			\
	vmovdqu y2, 10 * 16(mem);			\
	vmovdqu y3, 11 * 16(mem);			\
	vmovdqu y4, 12 * 16(mem);			\
	vmovdqu y5, 13 * 16(mem);			\
	vmovdqu y6, 14 * 16(mem);			\
	vmovdqu y7, 15 * 16(mem);

#define aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, idx)		\
	vmovdqu x0, ((idx + 0) * 16)(mem_tmp);		\
	vmovdqu x1, ((idx + 1) * 16)(mem_tmp);		\
	vmovdqu x2, ((idx + 2) * 16)(mem_tmp);		\
	vmovdqu x3, ((idx + 3) * 16)(mem_tmp);		\
	vmovdqu x4, ((idx + 4) * 16)(mem_tmp);		\
	vmovdqu x5, ((idx + 5) * 16)(mem_tmp);		\
	vmovdqu x6, ((idx + 6) * 16)(mem_tmp);		\
	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, idx)		\
	vmovdqu ((idx + 0) * 16)(mem_tmp), x0;		\
	vmovdqu ((idx + 1) * 16)(mem_tmp), x1;		\
	vmovdqu ((idx + 2) * 16)(mem_tmp), x2;		\
	vmovdqu ((idx + 3) * 16)(mem_tmp), x3;		\
	vmovdqu ((idx + 4) * 16)(mem_tmp), x4;		\
	vmovdqu ((idx + 5) * 16)(mem_tmp), x5;		\
	vmovdqu ((idx + 6) * 16)(mem_tmp), x6;		\
	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;

#define aria_ark_8way(x0, x1, x2, x3,			\
		      x4, x5, x6, x7,			\
		      t0, t1, t2, rk,			\
		      idx, round)			\
	/* AddRoundKey */				\
	vbroadcastss ((round * 16) + idx + 0)(rk), t0;	\
	vpsrld $24, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x0, x0;				\
	vpsrld $16, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x1, x1;				\
	vpsrld $8, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x2, x2;				\
	vpshufb t1, t0, t2;				\
	vpxor t2, x3, x3;				\
	vbroadcastss ((round * 16) + idx + 4)(rk), t0;	\
	vpsrld $24, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x4, x4;				\
	vpsrld $16, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x5, x5;				\
	vpsrld $8, t0, t2;				\
	vpshufb t1, t2, t2;				\
	vpxor t2, x6, x6;				\
	vpshufb t1, t0, t2;				\
	vpxor t2, x7, x7;

#ifdef CONFIG_AS_GFNI
#define aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    t0, t1, t2, t3,		\
			    t4, t5, t6, t7)		\
	vmovdqa .Ltf_s2_bitmatrix(%rip), t0;		\
	vmovdqa .Ltf_inv_bitmatrix(%rip), t1;		\
	vmovdqa .Ltf_id_bitmatrix(%rip), t2;		\
	vmovdqa .Ltf_aff_bitmatrix(%rip), t3;		\
	vmovdqa .Ltf_x2_bitmatrix(%rip), t4;		\
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1;	\
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5;	\
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2;	\
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6;	\
	vgf2p8affineinvqb $0, t2, x2, x2;		\
	vgf2p8affineinvqb $0, t2, x6, x6;		\
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0;	\
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4;	\
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3;	\
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7;	\
	vgf2p8affineinvqb $0, t2, x3, x3;		\
	vgf2p8affineinvqb $0, t2, x7, x7

#endif /* CONFIG_AS_GFNI */
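
/*
 * AES-NI based substitution layer.  vaesenclast with an all-zero round key
 * computes AES ShiftRows+SubBytes and vaesdeclast computes
 * InvShiftRows+InvSubBytes; the vpshufb with .Linv_shift_row/.Lshift_row
 * then undoes the row shift so that only the (inverse) AES S-box remains.
 * ARIA's S2 and X2 boxes are built on top of that by applying the combined
 * affine transforms defined below (filter_8bit with the
 * .Ltf_*__inv_aff__and__s2 and .Ltf_*__x2__and__fwd_aff tables).
 */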
#define aria_sbox_8way(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       t0, t1, t2, t3,			\
		       t4, t5, t6, t7)			\
	vmovdqa .Linv_shift_row(%rip), t0;		\
	vmovdqa .Lshift_row(%rip), t1;			\
	vbroadcastss .L0f0f0f0f(%rip), t6;		\
	vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2;	\
	vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3;	\
	vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4;	\
	vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5;	\
							\
	vaesenclast t7, x0, x0;				\
	vaesenclast t7, x4, x4;				\
	vaesenclast t7, x1, x1;				\
	vaesenclast t7, x5, x5;				\
	vaesdeclast t7, x2, x2;				\
	vaesdeclast t7, x6, x6;				\
							\
	/* AES inverse shift rows */			\
	vpshufb t0, x0, x0;				\
	vpshufb t0, x4, x4;				\
	vpshufb t0, x1, x1;				\
	vpshufb t0, x5, x5;				\
	vpshufb t1, x3, x3;				\
	vpshufb t1, x7, x7;				\
	vpshufb t1, x2, x2;				\
	vpshufb t1, x6, x6;				\
							\
	/* affine transformation for S2 */		\
	filter_8bit(x1, t2, t3, t6, t0);		\
	/* affine transformation for S2 */		\
	filter_8bit(x5, t2, t3, t6, t0);		\
							\
	/* affine transformation for X2 */		\
	filter_8bit(x3, t4, t5, t6, t0);		\
	/* affine transformation for X2 */		\
	filter_8bit(x7, t4, t5, t6, t0);		\
	vaesdeclast t7, x3, x3;				\
	vaesdeclast t7, x7, x7;

#define aria_diff_m(x0, x1, x2, x3,			\
		    t0, t1, t2, t3)			\
	/* T = rotr32(X, 8); */				\
	/* X ^= T */					\
	vpxor x0, x3, t0;				\
	vpxor x1, x0, t1;				\
	vpxor x2, x1, t2;				\
	vpxor x3, x2, t3;				\
	/* X = T ^ rotr(X, 16); */			\
	vpxor t2, x0, x0;				\
	vpxor x1, t3, t3;				\
	vpxor t0, x2, x2;				\
	vpxor t1, x3, x1;				\
	vmovdqu t3, x3;

#define aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7)			\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;				\
							\
	/* t2 ^= t3; */					\
	vpxor y4, y0, y0;				\
	vpxor y5, y1, y1;				\
	vpxor y6, y2, y2;				\
	vpxor y7, y3, y3;				\
							\
	/* t0 ^= t1; */					\
	vpxor x4, x0, x0;				\
	vpxor x5, x1, x1;				\
	vpxor x6, x2, x2;				\
	vpxor x7, x3, x3;				\
							\
	/* t3 ^= t1; */					\
	vpxor x4, y4, y4;				\
	vpxor x5, y5, y5;				\
	vpxor x6, y6, y6;				\
	vpxor x7, y7, y7;				\
							\
	/* t2 ^= t0; */					\
	vpxor x0, y0, y0;				\
	vpxor x1, y1, y1;				\
	vpxor x2, y2, y2;				\
	vpxor x3, y3, y3;				\
							\
	/* t1 ^= t2; */					\
	vpxor y0, x4, x4;				\
	vpxor y1, x5, x5;				\
	vpxor y2, x6, x6;				\
	vpxor y3, x7, x7;
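
/*
 * Diffusion layer.  ARIA's 16x16 binary matrix A is applied in pieces:
 * aria_diff_m mixes the bytes inside each 32-bit word (the rotr32 pseudo
 * code in its comments), aria_diff_word XORs the four word groups T0..T3
 * into each other, and the remaining byte permutation ("aria_diff_byte")
 * costs nothing at run time - it is done by reordering the register
 * arguments in aria_fe/aria_fo below.  Illustrative sketch of the word
 * step, following the comments in aria_diff_word:
 *
 *	t1 ^= t2; t2 ^= t3; t0 ^= t1; t3 ^= t1; t2 ^= t0; t1 ^= t2;
 */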
#define aria_fe(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round)			\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3,				\
		x4, x5, x6, x7,				\
		y0, y1, y2, y3,				\
		y4, y5, y6, y7,				\
		mem_tmp, rk, round, last_round)		\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, last_round);	\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5,	\
		       y0, y1, y2, y3, y4, y5, y6, y7);	\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, last_round);	\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);
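
/*
 * Round macros: aria_fe/aria_fo implement one even/odd ARIA round
 * (AddRoundKey, substitution layer, diffusion) on 16 byte-sliced blocks,
 * handled as two batches of eight registers with mem_tmp as spill space.
 * aria_ff is the final round, where the diffusion is replaced by a second
 * AddRoundKey with the last round key.  The *_gfni variants below are the
 * same except that the substitution layer uses GFNI.  Rough per-round
 * sketch (illustrative only):
 *
 *	state ^= rk[round];		// aria_ark_8way
 *	state = SL(state);		// aria_sbox_8way{,_gfni}
 *	state = A(state);		// aria_diff_* (skipped in aria_ff)
 */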
#ifdef CONFIG_AS_GFNI
#define aria_fe_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T3 = ABCD -> BADC				\
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6	\
	 * T0 = ABCD -> CDAB				\
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1	\
	 * T1 = ABCD -> DCBA				\
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4	\
	 */						\
	aria_diff_word(x2, x3, x0, x1,			\
		       x7, x6, x5, x4,			\
		       y0, y1, y2, y3,			\
		       y5, y4, y7, y6);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round)		\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way_gfni(x0, x1, x2, x3,		\
			    x4, x5, x6, x7,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3);	\
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3);	\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 0);		\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);		\
	aria_diff_word(x0, x1, x2, x3,			\
		       x4, x5, x6, x7,			\
		       y0, y1, y2, y3,			\
		       y4, y5, y6, y7);			\
	/* aria_diff_byte()				\
	 * T1 = ABCD -> BADC				\
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6	\
	 * T2 = ABCD -> CDAB				\
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1	\
	 * T3 = ABCD -> DCBA				\
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4	\
	 */						\
	aria_diff_word(x0, x1, x2, x3,			\
		       x5, x4, x7, x6,			\
		       y2, y3, y0, y1,			\
		       y7, y6, y5, y4);			\
	aria_store_state_8way(x3, x2, x1, x0,		\
			      x6, x7, x4, x5,		\
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3,			\
		     x4, x5, x6, x7,			\
		     y0, y1, y2, y3,			\
		     y4, y5, y6, y7,			\
		     mem_tmp, rk, round, last_round)	\
	vpxor y7, y7, y7;				\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, round);	\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 8, last_round);	\
							\
	aria_store_state_8way(x0, x1, x2, x3,		\
			      x4, x5, x6, x7,		\
			      mem_tmp, 8);		\
							\
	aria_load_state_8way(x0, x1, x2, x3,		\
			     x4, x5, x6, x7,		\
			     mem_tmp, 0);		\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, round);	\
							\
	aria_sbox_8way_gfni(x2, x3, x0, x1,		\
			    x6, x7, x4, x5,		\
			    y0, y1, y2, y3,		\
			    y4, y5, y6, y7);		\
							\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7,	\
		      y0, y7, y2, rk, 0, last_round);	\
							\
	aria_load_state_8way(y0, y1, y2, y3,		\
			     y4, y5, y6, y7,		\
			     mem_tmp, 8);

#endif /* CONFIG_AS_GFNI */

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
/* For isolating SubBytes from AESENCLAST, inv shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
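
/*
 * The GFNI bit-matrices below are built with BV8()/BM8X8() from the top of
 * this file: BV8 packs one matrix row into a byte (bit i = column i) and
 * BM8X8 packs eight rows into a .quad with row 0 in the most significant
 * byte.  For example, the identity matrix used by .Ltf_id_bitmatrix works
 * out to
 *
 *	BM8X8(BV8(1,0,...,0), ..., BV8(0,...,0,1)) == 0x0102040810204080
 *
 * Each matrix is stored as two identical .quad values so that one 128-bit
 * vmovdqa load provides the matrix for both 64-bit lanes consumed by
 * vgf2p8affineqb/vgf2p8affineinvqb.
 */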
#ifdef CONFIG_AS_GFNI
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
#endif /* CONFIG_AS_GFNI */

/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text
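
/*
 * Core 16-way transform.  Callers load 16 blocks into %xmm0..%xmm15
 * (inpack16_pre) and point %r9 at the expanded round keys; this helper
 * byte-slices the blocks into the 256-byte area at dst (which doubles as
 * the mem_tmp scratch space of the round macros), runs 12/14/16 rounds
 * depending on ARIA_CTX_rounds, and de-byte-slices the result back.
 */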
SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
	/* input:
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src
	 *	%xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 0);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 1);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 2);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 3);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 4);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 5);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 6);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 7);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 8);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 9);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 10);
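
	/*
	 * ARIA-128/192/256 use 12/14/16 rounds, so dispatch on
	 * ARIA_CTX_rounds; the last round is always aria_ff, which applies
	 * the final round key instead of the diffusion layer.  Roughly:
	 *
	 *	if (rounds == 12)      { ff(11, 12); }
	 *	else if (rounds == 14) { fe(11); fo(12); ff(13, 14); }
	 *	else                   { fe(11); fo(12); fe(13); fo(14);
	 *				 ff(15, 16); }
	 */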
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_192;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_256;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 14);
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
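
/*
 * Generate 16 CTR counter blocks.  The big-endian IV at (%r8) is
 * byteswapped to little endian and incremented with inc_le128(): block 0
 * reuses the IV itself, blocks 1..15 use IV+1..IV+15.  Blocks 0..7 are
 * staged through the keystream buffer at (%rcx) and reloaded into
 * %xmm0..%xmm7, blocks 8..15 stay in %xmm8..%xmm15, and the IV is advanced
 * by 16 for the next call.  Illustrative C sketch only:
 *
 *	ctr = be128_to_le(iv);
 *	block[0] = iv;
 *	for (i = 1; i < 16; i++)
 *		block[i] = le_to_be128(ctr + i);
 *	iv = le_to_be128(ctr + 16);
 */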
SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	/* load IV and byteswap */
	vmovdqu (%r8), %xmm8;

	vmovdqa .Lbswap128_mask (%rip), %xmm1;
	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */

	vpcmpeqd %xmm0, %xmm0, %xmm0;
	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	vmovdqu %xmm8, (0 * 16)(%rcx);
	vmovdqu %xmm9, (1 * 16)(%rcx);
	vmovdqu %xmm10, (2 * 16)(%rcx);
	vmovdqu %xmm11, (3 * 16)(%rcx);
	vmovdqu %xmm12, (4 * 16)(%rcx);
	vmovdqu %xmm13, (5 * 16)(%rcx);
	vmovdqu %xmm14, (6 * 16)(%rcx);
	vmovdqu %xmm15, (7 * 16)(%rcx);

	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm8;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm4;
	vmovdqu %xmm4, (%r8);

	vmovdqu (0 * 16)(%rcx), %xmm0;
	vmovdqu (1 * 16)(%rcx), %xmm1;
	vmovdqu (2 * 16)(%rcx), %xmm2;
	vmovdqu (3 * 16)(%rcx), %xmm3;
	vmovdqu (4 * 16)(%rcx), %xmm4;
	vmovdqu (5 * 16)(%rcx), %xmm5;
	vmovdqu (6 * 16)(%rcx), %xmm6;
	vmovdqu (7 * 16)(%rcx), %xmm7;

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)
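
/*
 * GFNI variants: same entry points and register usage as the AES-NI based
 * functions above, but the substitution layer is computed with
 * vgf2p8affineqb/vgf2p8affineinvqb and the bit-matrices defined earlier.
 */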
#ifdef CONFIG_AS_GFNI
SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
	/* input:
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src
	 *	%xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
		      %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11,
		      %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 1);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 3);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 5);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 7);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 9);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;
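
	/*
	 * Keep the real dst/src in %r10/%r11 and point %rsi/%rdx at the
	 * keystream buffer, which the crypt routine below uses as its
	 * byte-slicing scratch area; the encrypted counter blocks come back
	 * in the xmm registers and are XORed with src before being written
	 * to dst.
	 */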
	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
#endif /* CONFIG_AS_GFNI */