1 /* SPDX-License-Identifier: GPL-2.0-or-later * 2 /* 3 * x86_64/AVX2 assembler optimized version of 4 * 5 * Copyright © 2012-2013 Jussi Kivilinna <juss 6 * 7 * Based on AVX assembler implementation of Se 8 * Copyright © 2012 Johannes Goetzfried 9 * <Johannes.Goetzfried@informatik.stud.un 10 */ 11 12 #include <linux/linkage.h> 13 #include <asm/frame.h> 14 #include "glue_helper-asm-avx2.S" 15 16 .file "serpent-avx2-asm_64.S" 17 18 .section .rodata.cst16.bswap128_mask, " 19 .align 16 20 .Lbswap128_mask: 21 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 22 23 .text 24 25 #define CTX %rdi 26 27 #define RNOT %ymm0 28 #define tp %ymm1 29 30 #define RA1 %ymm2 31 #define RA2 %ymm3 32 #define RB1 %ymm4 33 #define RB2 %ymm5 34 #define RC1 %ymm6 35 #define RC2 %ymm7 36 #define RD1 %ymm8 37 #define RD2 %ymm9 38 #define RE1 %ymm10 39 #define RE2 %ymm11 40 41 #define RK0 %ymm12 42 #define RK1 %ymm13 43 #define RK2 %ymm14 44 #define RK3 %ymm15 45 46 #define RK0x %xmm12 47 #define RK1x %xmm13 48 #define RK2x %xmm14 49 #define RK3x %xmm15 50 51 #define S0_1(x0, x1, x2, x3, x4) \ 52 vpor x0, x3, tp; \ 53 vpxor x3, x0, x0; \ 54 vpxor x2, x3, x4; \ 55 vpxor RNOT, x4, x4; \ 56 vpxor x1, tp, x3; \ 57 vpand x0, x1, x1; \ 58 vpxor x4, x1, x1; \ 59 vpxor x0, x2, x2; 60 #define S0_2(x0, x1, x2, x3, x4) \ 61 vpxor x3, x0, x0; \ 62 vpor x0, x4, x4; \ 63 vpxor x2, x0, x0; \ 64 vpand x1, x2, x2; \ 65 vpxor x2, x3, x3; \ 66 vpxor RNOT, x1, x1; \ 67 vpxor x4, x2, x2; \ 68 vpxor x2, x1, x1; 69 70 #define S1_1(x0, x1, x2, x3, x4) \ 71 vpxor x0, x1, tp; \ 72 vpxor x3, x0, x0; \ 73 vpxor RNOT, x3, x3; \ 74 vpand tp, x1, x4; \ 75 vpor tp, x0, x0; \ 76 vpxor x2, x3, x3; \ 77 vpxor x3, x0, x0; \ 78 vpxor x3, tp, x1; 79 #define S1_2(x0, x1, x2, x3, x4) \ 80 vpxor x4, x3, x3; \ 81 vpor x4, x1, x1; \ 82 vpxor x2, x4, x4; \ 83 vpand x0, x2, x2; \ 84 vpxor x1, x2, x2; \ 85 vpor x0, x1, x1; \ 86 vpxor RNOT, x0, x0; \ 87 vpxor x2, x0, x0; \ 88 vpxor x1, x4, x4; 89 90 #define S2_1(x0, x1, x2, x3, x4) \ 91 vpxor RNOT, x3, x3; \ 92 vpxor x0, x1, x1; \ 93 vpand x2, x0, tp; \ 94 vpxor x3, tp, tp; \ 95 vpor x0, x3, x3; \ 96 vpxor x1, x2, x2; \ 97 vpxor x1, x3, x3; \ 98 vpand tp, x1, x1; 99 #define S2_2(x0, x1, x2, x3, x4) \ 100 vpxor x2, tp, tp; \ 101 vpand x3, x2, x2; \ 102 vpor x1, x3, x3; \ 103 vpxor RNOT, tp, tp; \ 104 vpxor tp, x3, x3; \ 105 vpxor tp, x0, x4; \ 106 vpxor x2, tp, x0; \ 107 vpor x2, x1, x1; 108 109 #define S3_1(x0, x1, x2, x3, x4) \ 110 vpxor x3, x1, tp; \ 111 vpor x0, x3, x3; \ 112 vpand x0, x1, x4; \ 113 vpxor x2, x0, x0; \ 114 vpxor tp, x2, x2; \ 115 vpand x3, tp, x1; \ 116 vpxor x3, x2, x2; \ 117 vpor x4, x0, x0; \ 118 vpxor x3, x4, x4; 119 #define S3_2(x0, x1, x2, x3, x4) \ 120 vpxor x0, x1, x1; \ 121 vpand x3, x0, x0; \ 122 vpand x4, x3, x3; \ 123 vpxor x2, x3, x3; \ 124 vpor x1, x4, x4; \ 125 vpand x1, x2, x2; \ 126 vpxor x3, x4, x4; \ 127 vpxor x3, x0, x0; \ 128 vpxor x2, x3, x3; 129 130 #define S4_1(x0, x1, x2, x3, x4) \ 131 vpand x0, x3, tp; \ 132 vpxor x3, x0, x0; \ 133 vpxor x2, tp, tp; \ 134 vpor x3, x2, x2; \ 135 vpxor x1, x0, x0; \ 136 vpxor tp, x3, x4; \ 137 vpor x0, x2, x2; \ 138 vpxor x1, x2, x2; 139 #define S4_2(x0, x1, x2, x3, x4) \ 140 vpand x0, x1, x1; \ 141 vpxor x4, x1, x1; \ 142 vpand x2, x4, x4; \ 143 vpxor tp, x2, x2; \ 144 vpxor x0, x4, x4; \ 145 vpor x1, tp, x3; \ 146 vpxor RNOT, x1, x1; \ 147 vpxor x0, x3, x3; 148 149 #define S5_1(x0, x1, x2, x3, x4) \ 150 vpor x0, x1, tp; \ 151 vpxor tp, x2, x2; \ 152 vpxor RNOT, x3, x3; \ 153 vpxor x0, x1, x4; \ 154 vpxor x2, x0, x0; \ 155 vpand x4, tp, x1; \ 156 vpor x3, x4, x4; \ 157 vpxor x0, x4, x4; 158 #define S5_2(x0, x1, x2, x3, x4) \ 159 vpand x3, x0, x0; \ 160 vpxor x3, x1, x1; \ 161 vpxor x2, x3, x3; \ 162 vpxor x1, x0, x0; \ 163 vpand x4, x2, x2; \ 164 vpxor x2, x1, x1; \ 165 vpand x0, x2, x2; \ 166 vpxor x2, x3, x3; 167 168 #define S6_1(x0, x1, x2, x3, x4) \ 169 vpxor x0, x3, x3; \ 170 vpxor x2, x1, tp; \ 171 vpxor x0, x2, x2; \ 172 vpand x3, x0, x0; \ 173 vpor x3, tp, tp; \ 174 vpxor RNOT, x1, x4; \ 175 vpxor tp, x0, x0; \ 176 vpxor x2, tp, x1; 177 #define S6_2(x0, x1, x2, x3, x4) \ 178 vpxor x4, x3, x3; \ 179 vpxor x0, x4, x4; \ 180 vpand x0, x2, x2; \ 181 vpxor x1, x4, x4; \ 182 vpxor x3, x2, x2; \ 183 vpand x1, x3, x3; \ 184 vpxor x0, x3, x3; \ 185 vpxor x2, x1, x1; 186 187 #define S7_1(x0, x1, x2, x3, x4) \ 188 vpxor RNOT, x1, tp; \ 189 vpxor RNOT, x0, x0; \ 190 vpand x2, tp, x1; \ 191 vpxor x3, x1, x1; \ 192 vpor tp, x3, x3; \ 193 vpxor x2, tp, x4; \ 194 vpxor x3, x2, x2; \ 195 vpxor x0, x3, x3; \ 196 vpor x1, x0, x0; 197 #define S7_2(x0, x1, x2, x3, x4) \ 198 vpand x0, x2, x2; \ 199 vpxor x4, x0, x0; \ 200 vpxor x3, x4, x4; \ 201 vpand x0, x3, x3; \ 202 vpxor x1, x4, x4; \ 203 vpxor x4, x2, x2; \ 204 vpxor x1, x3, x3; \ 205 vpor x0, x4, x4; \ 206 vpxor x1, x4, x4; 207 208 #define SI0_1(x0, x1, x2, x3, x4) \ 209 vpxor x0, x1, x1; \ 210 vpor x1, x3, tp; \ 211 vpxor x1, x3, x4; \ 212 vpxor RNOT, x0, x0; \ 213 vpxor tp, x2, x2; \ 214 vpxor x0, tp, x3; \ 215 vpand x1, x0, x0; \ 216 vpxor x2, x0, x0; 217 #define SI0_2(x0, x1, x2, x3, x4) \ 218 vpand x3, x2, x2; \ 219 vpxor x4, x3, x3; \ 220 vpxor x3, x2, x2; \ 221 vpxor x3, x1, x1; \ 222 vpand x0, x3, x3; \ 223 vpxor x0, x1, x1; \ 224 vpxor x2, x0, x0; \ 225 vpxor x3, x4, x4; 226 227 #define SI1_1(x0, x1, x2, x3, x4) \ 228 vpxor x3, x1, x1; \ 229 vpxor x2, x0, tp; \ 230 vpxor RNOT, x2, x2; \ 231 vpor x1, x0, x4; \ 232 vpxor x3, x4, x4; \ 233 vpand x1, x3, x3; \ 234 vpxor x2, x1, x1; \ 235 vpand x4, x2, x2; 236 #define SI1_2(x0, x1, x2, x3, x4) \ 237 vpxor x1, x4, x4; \ 238 vpor x3, x1, x1; \ 239 vpxor tp, x3, x3; \ 240 vpxor tp, x2, x2; \ 241 vpor x4, tp, x0; \ 242 vpxor x4, x2, x2; \ 243 vpxor x0, x1, x1; \ 244 vpxor x1, x4, x4; 245 246 #define SI2_1(x0, x1, x2, x3, x4) \ 247 vpxor x1, x2, x2; \ 248 vpxor RNOT, x3, tp; \ 249 vpor x2, tp, tp; \ 250 vpxor x3, x2, x2; \ 251 vpxor x0, x3, x4; \ 252 vpxor x1, tp, x3; \ 253 vpor x2, x1, x1; \ 254 vpxor x0, x2, x2; 255 #define SI2_2(x0, x1, x2, x3, x4) \ 256 vpxor x4, x1, x1; \ 257 vpor x3, x4, x4; \ 258 vpxor x3, x2, x2; \ 259 vpxor x2, x4, x4; \ 260 vpand x1, x2, x2; \ 261 vpxor x3, x2, x2; \ 262 vpxor x4, x3, x3; \ 263 vpxor x0, x4, x4; 264 265 #define SI3_1(x0, x1, x2, x3, x4) \ 266 vpxor x1, x2, x2; \ 267 vpand x2, x1, tp; \ 268 vpxor x0, tp, tp; \ 269 vpor x1, x0, x0; \ 270 vpxor x3, x1, x4; \ 271 vpxor x3, x0, x0; \ 272 vpor tp, x3, x3; \ 273 vpxor x2, tp, x1; 274 #define SI3_2(x0, x1, x2, x3, x4) \ 275 vpxor x3, x1, x1; \ 276 vpxor x2, x0, x0; \ 277 vpxor x3, x2, x2; \ 278 vpand x1, x3, x3; \ 279 vpxor x0, x1, x1; \ 280 vpand x2, x0, x0; \ 281 vpxor x3, x4, x4; \ 282 vpxor x0, x3, x3; \ 283 vpxor x1, x0, x0; 284 285 #define SI4_1(x0, x1, x2, x3, x4) \ 286 vpxor x3, x2, x2; \ 287 vpand x1, x0, tp; \ 288 vpxor x2, tp, tp; \ 289 vpor x3, x2, x2; \ 290 vpxor RNOT, x0, x4; \ 291 vpxor tp, x1, x1; \ 292 vpxor x2, tp, x0; \ 293 vpand x4, x2, x2; 294 #define SI4_2(x0, x1, x2, x3, x4) \ 295 vpxor x0, x2, x2; \ 296 vpor x4, x0, x0; \ 297 vpxor x3, x0, x0; \ 298 vpand x2, x3, x3; \ 299 vpxor x3, x4, x4; \ 300 vpxor x1, x3, x3; \ 301 vpand x0, x1, x1; \ 302 vpxor x1, x4, x4; \ 303 vpxor x3, x0, x0; 304 305 #define SI5_1(x0, x1, x2, x3, x4) \ 306 vpor x2, x1, tp; \ 307 vpxor x1, x2, x2; \ 308 vpxor x3, tp, tp; \ 309 vpand x1, x3, x3; \ 310 vpxor x3, x2, x2; \ 311 vpor x0, x3, x3; \ 312 vpxor RNOT, x0, x0; \ 313 vpxor x2, x3, x3; \ 314 vpor x0, x2, x2; 315 #define SI5_2(x0, x1, x2, x3, x4) \ 316 vpxor tp, x1, x4; \ 317 vpxor x4, x2, x2; \ 318 vpand x0, x4, x4; \ 319 vpxor tp, x0, x0; \ 320 vpxor x3, tp, x1; \ 321 vpand x2, x0, x0; \ 322 vpxor x3, x2, x2; \ 323 vpxor x2, x0, x0; \ 324 vpxor x4, x2, x2; \ 325 vpxor x3, x4, x4; 326 327 #define SI6_1(x0, x1, x2, x3, x4) \ 328 vpxor x2, x0, x0; \ 329 vpand x3, x0, tp; \ 330 vpxor x3, x2, x2; \ 331 vpxor x2, tp, tp; \ 332 vpxor x1, x3, x3; \ 333 vpor x0, x2, x2; \ 334 vpxor x3, x2, x2; \ 335 vpand tp, x3, x3; 336 #define SI6_2(x0, x1, x2, x3, x4) \ 337 vpxor RNOT, tp, tp; \ 338 vpxor x1, x3, x3; \ 339 vpand x2, x1, x1; \ 340 vpxor tp, x0, x4; \ 341 vpxor x4, x3, x3; \ 342 vpxor x2, x4, x4; \ 343 vpxor x1, tp, x0; \ 344 vpxor x0, x2, x2; 345 346 #define SI7_1(x0, x1, x2, x3, x4) \ 347 vpand x0, x3, tp; \ 348 vpxor x2, x0, x0; \ 349 vpor x3, x2, x2; \ 350 vpxor x1, x3, x4; \ 351 vpxor RNOT, x0, x0; \ 352 vpor tp, x1, x1; \ 353 vpxor x0, x4, x4; \ 354 vpand x2, x0, x0; \ 355 vpxor x1, x0, x0; 356 #define SI7_2(x0, x1, x2, x3, x4) \ 357 vpand x2, x1, x1; \ 358 vpxor x2, tp, x3; \ 359 vpxor x3, x4, x4; \ 360 vpand x3, x2, x2; \ 361 vpor x0, x3, x3; \ 362 vpxor x4, x1, x1; \ 363 vpxor x4, x3, x3; \ 364 vpand x0, x4, x4; \ 365 vpxor x2, x4, x4; 366 367 #define get_key(i,j,t) \ 368 vpbroadcastd (4*(i)+(j))*4(CTX), t; 369 370 #define K2(x0, x1, x2, x3, x4, i) \ 371 get_key(i, 0, RK0); \ 372 get_key(i, 1, RK1); \ 373 get_key(i, 2, RK2); \ 374 get_key(i, 3, RK3); \ 375 vpxor RK0, x0 ## 1, x0 ## 1; \ 376 vpxor RK1, x1 ## 1, x1 ## 1; \ 377 vpxor RK2, x2 ## 1, x2 ## 1; \ 378 vpxor RK3, x3 ## 1, x3 ## 1; \ 379 vpxor RK0, x0 ## 2, x0 ## 380 vpxor RK1, x1 ## 2, x1 ## 381 vpxor RK2, x2 ## 2, x2 ## 382 vpxor RK3, x3 ## 2, x3 ## 383 384 #define LK2(x0, x1, x2, x3, x4, i) \ 385 vpslld $13, x0 ## 1, x4 ## 386 vpsrld $(32 - 13), x0 ## 1, x0 ## 387 vpor x4 ## 1, x0 ## 388 vpxor x0 ## 1, x1 ## 389 vpslld $3, x2 ## 1, x4 ## 390 vpsrld $(32 - 3), x2 ## 1, x2 ## 391 vpor x4 ## 1, x2 ## 392 vpxor x2 ## 1, x1 ## 393 vpslld $13, x0 ## 394 vpsrld $(32 - 13), x0 ## 395 vpor x4 ## 396 vpxor x0 ## 397 vpslld $3, x2 ## 398 vpsrld $(32 - 3), x2 ## 399 vpor x4 ## 400 vpxor x2 ## 401 vpslld $1, x1 ## 1, x4 ## 402 vpsrld $(32 - 1), x1 ## 1, x1 ## 403 vpor x4 ## 1, x1 ## 404 vpslld $3, x0 ## 1, x4 ## 405 vpxor x2 ## 1, x3 ## 406 vpxor x4 ## 1, x3 ## 407 get_key(i, 1, RK1); \ 408 vpslld $1, x1 ## 409 vpsrld $(32 - 1), x1 ## 410 vpor x4 ## 411 vpslld $3, x0 ## 412 vpxor x2 ## 413 vpxor x4 ## 414 get_key(i, 3, RK3); \ 415 vpslld $7, x3 ## 1, x4 ## 416 vpsrld $(32 - 7), x3 ## 1, x3 ## 417 vpor x4 ## 1, x3 ## 418 vpslld $7, x1 ## 1, x4 ## 419 vpxor x1 ## 1, x0 ## 420 vpxor x3 ## 1, x0 ## 421 vpxor x3 ## 1, x2 ## 422 vpxor x4 ## 1, x2 ## 423 get_key(i, 0, RK0); \ 424 vpslld $7, x3 ## 425 vpsrld $(32 - 7), x3 ## 426 vpor x4 ## 427 vpslld $7, x1 ## 428 vpxor x1 ## 429 vpxor x3 ## 430 vpxor x3 ## 431 vpxor x4 ## 432 get_key(i, 2, RK2); \ 433 vpxor RK1, x1 ## 1, 434 vpxor RK3, x3 ## 1, 435 vpslld $5, x0 ## 1, x4 ## 436 vpsrld $(32 - 5), x0 ## 1, x0 ## 437 vpor x4 ## 1, x0 ## 438 vpslld $22, x2 ## 1, x4 ## 439 vpsrld $(32 - 22), x2 ## 1, x2 ## 440 vpor x4 ## 1, x2 ## 441 vpxor RK0, x0 ## 1, 442 vpxor RK2, x2 ## 1, 443 vpxor RK1, x 444 vpxor RK3, x 445 vpslld $5, x0 ## 446 vpsrld $(32 - 5), x0 ## 447 vpor x4 ## 448 vpslld $22, x2 ## 449 vpsrld $(32 - 22), x2 ## 450 vpor x4 ## 451 vpxor RK0, x 452 vpxor RK2, x 453 454 #define KL2(x0, x1, x2, x3, x4, i) \ 455 vpxor RK0, x0 ## 1, 456 vpxor RK2, x2 ## 1, 457 vpsrld $5, x0 ## 1, x4 ## 458 vpslld $(32 - 5), x0 ## 1, x0 ## 459 vpor x4 ## 1, x0 ## 460 vpxor RK3, x3 ## 1, 461 vpxor RK1, x1 ## 1, 462 vpsrld $22, x2 ## 1, x4 ## 463 vpslld $(32 - 22), x2 ## 1, x2 ## 464 vpor x4 ## 1, x2 ## 465 vpxor x3 ## 1, x2 ## 466 vpxor RK0, x 467 vpxor RK2, x 468 vpsrld $5, x0 ## 469 vpslld $(32 - 5), x0 ## 470 vpor x4 ## 471 vpxor RK3, x 472 vpxor RK1, x 473 vpsrld $22, x2 ## 474 vpslld $(32 - 22), x2 ## 475 vpor x4 ## 476 vpxor x3 ## 477 vpxor x3 ## 1, x0 ## 478 vpslld $7, x1 ## 1, x4 ## 479 vpxor x1 ## 1, x0 ## 480 vpxor x4 ## 1, x2 ## 481 vpsrld $1, x1 ## 1, x4 ## 482 vpslld $(32 - 1), x1 ## 1, x1 ## 483 vpor x4 ## 1, x1 ## 484 vpxor x3 ## 485 vpslld $7, x1 ## 486 vpxor x1 ## 487 vpxor x4 ## 488 vpsrld $1, x1 ## 489 vpslld $(32 - 1), x1 ## 490 vpor x4 ## 491 vpsrld $7, x3 ## 1, x4 ## 492 vpslld $(32 - 7), x3 ## 1, x3 ## 493 vpor x4 ## 1, x3 ## 494 vpxor x0 ## 1, x1 ## 495 vpslld $3, x0 ## 1, x4 ## 496 vpxor x4 ## 1, x3 ## 497 vpsrld $7, x3 ## 498 vpslld $(32 - 7), x3 ## 499 vpor x4 ## 500 vpxor x0 ## 501 vpslld $3, x0 ## 502 vpxor x4 ## 503 vpsrld $13, x0 ## 1, x4 ## 504 vpslld $(32 - 13), x0 ## 1, x0 ## 505 vpor x4 ## 1, x0 ## 506 vpxor x2 ## 1, x1 ## 507 vpxor x2 ## 1, x3 ## 508 vpsrld $3, x2 ## 1, x4 ## 509 vpslld $(32 - 3), x2 ## 1, x2 ## 510 vpor x4 ## 1, x2 ## 511 vpsrld $13, x0 ## 512 vpslld $(32 - 13), x0 ## 513 vpor x4 ## 514 vpxor x2 ## 515 vpxor x2 ## 516 vpsrld $3, x2 ## 517 vpslld $(32 - 3), x2 ## 518 vpor x4 ## 519 520 #define S(SBOX, x0, x1, x2, x3, x4) \ 521 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, 522 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, 523 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, 524 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, 525 526 #define SP(SBOX, x0, x1, x2, x3, x4, i) \ 527 get_key(i, 0, RK0); \ 528 SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, 529 get_key(i, 2, RK2); \ 530 SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, 531 get_key(i, 3, RK3); \ 532 SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, 533 get_key(i, 1, RK1); \ 534 SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, 535 536 #define transpose_4x4(x0, x1, x2, x3, t0, t1, 537 vpunpckldq x1, x0, t0; \ 538 vpunpckhdq x1, x0, t2; \ 539 vpunpckldq x3, x2, t1; \ 540 vpunpckhdq x3, x2, x3; \ 541 \ 542 vpunpcklqdq t1, t0, x0; \ 543 vpunpckhqdq t1, t0, x1; \ 544 vpunpcklqdq x3, t2, x2; \ 545 vpunpckhqdq x3, t2, x3; 546 547 #define read_blocks(x0, x1, x2, x3, t0, t1, t2 548 transpose_4x4(x0, x1, x2, x3, t0, t1, 549 550 #define write_blocks(x0, x1, x2, x3, t0, t1, t 551 transpose_4x4(x0, x1, x2, x3, t0, t1, 552 553 SYM_FUNC_START_LOCAL(__serpent_enc_blk16) 554 /* input: 555 * %rdi: ctx, CTX 556 * RA1, RB1, RC1, RD1, RA2, RB2, 557 * output: 558 * RA1, RB1, RC1, RD1, RA2, RB2, 559 */ 560 561 vpcmpeqd RNOT, RNOT, RNOT; 562 563 read_blocks(RA1, RB1, RC1, RD1, RK0, R 564 read_blocks(RA2, RB2, RC2, RD2, RK0, R 565 566 567 S(S0, RA, RB, RC, RD, RE); 568 S(S1, RC, RB, RD, RA, RE); 569 S(S2, RE, RD, RA, RC, RB); 570 S(S3, RB, RD, RE, RC, RA); 571 S(S4, RC, RA, RD, RB, RE); 572 S(S5, RA, RD, RB, RE, RC); 573 S(S6, RC, RA, RD, RE, RB); 574 S(S7, RD, RB, RA, RE, RC); 575 S(S0, RC, RA, RE, RD, RB); 576 S(S1, RE, RA, RD, RC, RB); 577 S(S2, RB, RD, RC, RE, RA); 578 S(S3, RA, RD, RB, RE, RC); 579 S(S4, RE, RC, RD, RA, RB); 580 S(S5, RC, RD, RA, RB, RE); 581 S(S6, RE, RC, RD, RB, RA); 582 S(S7, RD, RA, RC, RB, RE); 583 S(S0, RE, RC, RB, RD, RA); 584 S(S1, RB, RC, RD, RE, RA); 585 S(S2, RA, RD, RE, RB, RC); 586 S(S3, RC, RD, RA, RB, RE); 587 S(S4, RB, RE, RD, RC, RA); 588 S(S5, RE, RD, RC, RA, RB); 589 S(S6, RB, RE, RD, RA, RC); 590 S(S7, RD, RC, RE, RA, RB); 591 S(S0, RB, RE, RA, RD, RC); 592 S(S1, RA, RE, RD, RB, RC); 593 S(S2, RC, RD, RB, RA, RE); 594 S(S3, RE, RD, RC, RA, RB); 595 S(S4, RA, RB, RD, RE, RC); 596 S(S5, RB, RD, RE, RC, RA); 597 S(S6, RA, RB, RD, RC, RE); 598 S(S7, RD, RE, RB, RC, RA); 599 600 write_blocks(RA1, RB1, RC1, RD1, RK0, 601 write_blocks(RA2, RB2, RC2, RD2, RK0, 602 603 RET; 604 SYM_FUNC_END(__serpent_enc_blk16) 605 606 SYM_FUNC_START_LOCAL(__serpent_dec_blk16) 607 /* input: 608 * %rdi: ctx, CTX 609 * RA1, RB1, RC1, RD1, RA2, RB2, 610 * output: 611 * RC1, RD1, RB1, RE1, RC2, RD2, 612 */ 613 614 vpcmpeqd RNOT, RNOT, RNOT; 615 616 read_blocks(RA1, RB1, RC1, RD1, RK0, R 617 read_blocks(RA2, RB2, RC2, RD2, RK0, R 618 619 620 SP(SI7, RA, RB, RC, RD, RE, 31); 621 SP(SI6, RB, RD, RA, RE, RC, 30); 622 SP(SI5, RA, RC, RE, RB, RD, 29); 623 SP(SI4, RC, RD, RA, RE, RB, 28); 624 SP(SI3, RC, RA, RB, RE, RD, 27); 625 SP(SI2, RB, RC, RD, RE, RA, 26); 626 SP(SI1, RC, RA, RE, RD, RB, 25); 627 SP(SI0, RB, RA, RE, RD, RC, 24); 628 SP(SI7, RE, RC, RA, RB, RD, 23); 629 SP(SI6, RC, RB, RE, RD, RA, 22); 630 SP(SI5, RE, RA, RD, RC, RB, 21); 631 SP(SI4, RA, RB, RE, RD, RC, 20); 632 SP(SI3, RA, RE, RC, RD, RB, 19); 633 SP(SI2, RC, RA, RB, RD, RE, 18); 634 SP(SI1, RA, RE, RD, RB, RC, 17); 635 SP(SI0, RC, RE, RD, RB, RA, 16); 636 SP(SI7, RD, RA, RE, RC, RB, 15); 637 SP(SI6, RA, RC, RD, RB, RE, 14); 638 SP(SI5, RD, RE, RB, RA, RC, 13); 639 SP(SI4, RE, RC, RD, RB, RA, 12); 640 SP(SI3, RE, RD, RA, RB, RC, 11); 641 SP(SI2, RA, RE, RC, RB, RD, 10); 642 SP(SI1, RE, RD, RB, RC, RA, 9); 643 SP(SI0, RA, RD, RB, RC, RE, 8); 644 SP(SI7, RB, RE, RD, RA, RC, 7); 645 SP(SI6, RE, RA, RB, RC, RD, 6); 646 SP(SI5, RB, RD, RC, RE, RA, 5); 647 SP(SI4, RD, RA, RB, RC, RE, 4); 648 SP(SI3, RD, RB, RE, RC, RA, 3); 649 SP(SI2, RE, RD, RA, RC, RB, 2); 650 SP(SI1, RD, RB, RC, RA, RE, 1); 651 S(SI0, RE, RB, RC, RA, RD); 652 653 write_blocks(RC1, RD1, RB1, RE1, RK0, 654 write_blocks(RC2, RD2, RB2, RE2, RK0, 655 656 RET; 657 SYM_FUNC_END(__serpent_dec_blk16) 658 659 SYM_FUNC_START(serpent_ecb_enc_16way) 660 /* input: 661 * %rdi: ctx, CTX 662 * %rsi: dst 663 * %rdx: src 664 */ 665 FRAME_BEGIN 666 667 vzeroupper; 668 669 load_16way(%rdx, RA1, RB1, RC1, RD1, R 670 671 call __serpent_enc_blk16; 672 673 store_16way(%rsi, RA1, RB1, RC1, RD1, 674 675 vzeroupper; 676 677 FRAME_END 678 RET; 679 SYM_FUNC_END(serpent_ecb_enc_16way) 680 681 SYM_FUNC_START(serpent_ecb_dec_16way) 682 /* input: 683 * %rdi: ctx, CTX 684 * %rsi: dst 685 * %rdx: src 686 */ 687 FRAME_BEGIN 688 689 vzeroupper; 690 691 load_16way(%rdx, RA1, RB1, RC1, RD1, R 692 693 call __serpent_dec_blk16; 694 695 store_16way(%rsi, RC1, RD1, RB1, RE1, 696 697 vzeroupper; 698 699 FRAME_END 700 RET; 701 SYM_FUNC_END(serpent_ecb_dec_16way) 702 703 SYM_FUNC_START(serpent_cbc_dec_16way) 704 /* input: 705 * %rdi: ctx, CTX 706 * %rsi: dst 707 * %rdx: src 708 */ 709 FRAME_BEGIN 710 711 vzeroupper; 712 713 load_16way(%rdx, RA1, RB1, RC1, RD1, R 714 715 call __serpent_dec_blk16; 716 717 store_cbc_16way(%rdx, %rsi, RC1, RD1, 718 RK0); 719 720 vzeroupper; 721 722 FRAME_END 723 RET; 724 SYM_FUNC_END(serpent_cbc_dec_16way)
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.