/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on crypto/serpent.c by
 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
 */

#include <linux/linkage.h>

.file "serpent-sse2-i586-asm_32.S"
.text

/* cdecl stack-argument offsets, relative to %esp at function entry */
#define arg_ctx 4
#define arg_dst 8
#define arg_src 12
#define arg_xor 16

/**********************************************************************
  4-way SSE2 serpent
 **********************************************************************/
#define CTX %edx

/*
 * Bit-sliced state: each xmm register holds the same 32-bit state word
 * of four independent blocks (one block per 32-bit lane).
 */
#define RA %xmm0
#define RB %xmm1
#define RC %xmm2
#define RD %xmm3
#define RE %xmm4

/* scratch */
#define RT0 %xmm5
#define RT1 %xmm6

/* all-ones constant; pxor with RNOT implements bitwise NOT */
#define RNOT %xmm7

/* broadcast 32-bit expanded-key word j of round i into all 4 lanes of t */
#define get_key(i, j, t) \
	movd (4*(i)+(j))*4(CTX), t; \
	pshufd $0, t, t;

/* key mixing: xor the four round-i subkey words into the state */
#define K(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, x4); \
	get_key(i, 1, RT0); \
	get_key(i, 2, RT1); \
	pxor x4, x0; \
	pxor RT0, x1; \
	pxor RT1, x2; \
	get_key(i, 3, x4); \
	pxor x4, x3;

/*
 * Forward linear transform followed by key mixing for round i.
 * Rotates are built from pslld/psrld/por pairs (SSE2 has no rotate);
 * the get_key loads are interleaved to hide latency.
 */
#define LK(x0, x1, x2, x3, x4, i) \
	movdqa x0, x4; \
	pslld $13, x0; \
	psrld $(32 - 13), x4; \
	por x4, x0; \
	pxor x0, x1; \
	movdqa x2, x4; \
	pslld $3, x2; \
	psrld $(32 - 3), x4; \
	por x4, x2; \
	pxor x2, x1; \
	movdqa x1, x4; \
	pslld $1, x1; \
	psrld $(32 - 1), x4; \
	por x4, x1; \
	movdqa x0, x4; \
	pslld $3, x4; \
	pxor x2, x3; \
	pxor x4, x3; \
	movdqa x3, x4; \
	pslld $7, x3; \
	psrld $(32 - 7), x4; \
	por x4, x3; \
	movdqa x1, x4; \
	pslld $7, x4; \
	pxor x1, x0; \
	pxor x3, x0; \
	pxor x3, x2; \
	pxor x4, x2; \
	movdqa x0, x4; \
	get_key(i, 1, RT0); \
	pxor RT0, x1; \
	get_key(i, 3, RT0); \
	pxor RT0, x3; \
	pslld $5, x0; \
	psrld $(32 - 5), x4; \
	por x4, x0; \
	movdqa x2, x4; \
	pslld $22, x2; \
	psrld $(32 - 22), x4; \
	por x4, x2; \
	get_key(i, 0, RT0); \
	pxor RT0, x0; \
	get_key(i, 2, RT0); \
	pxor RT0, x2;

/*
 * Key mixing followed by the inverse linear transform (decryption
 * direction: rotates go the opposite way to LK).
 */
#define KL(x0, x1, x2, x3, x4, i) \
	K(x0, x1, x2, x3, x4, i); \
	movdqa x0, x4; \
	psrld $5, x0; \
	pslld $(32 - 5), x4; \
	por x4, x0; \
	movdqa x2, x4; \
	psrld $22, x2; \
	pslld $(32 - 22), x4; \
	por x4, x2; \
	pxor x3, x2; \
	pxor x3, x0; \
	movdqa x1, x4; \
	pslld $7, x4; \
	pxor x1, x0; \
	pxor x4, x2; \
	movdqa x1, x4; \
	psrld $1, x1; \
	pslld $(32 - 1), x4; \
	por x4, x1; \
	movdqa x3, x4; \
	psrld $7, x3; \
	pslld $(32 - 7), x4; \
	por x4, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pslld $3, x4; \
	pxor x4, x3; \
	movdqa x0, x4; \
	psrld $13, x0; \
	pslld $(32 - 13), x4; \
	por x4, x0; \
	pxor x2, x1; \
	pxor x2, x3; \
	movdqa x2, x4; \
	psrld $3, x2; \
	pslld $(32 - 3), x4; \
	por x4, x2;

/* Bit-sliced Serpent S-boxes S0..S7; x4 is scratch/output shuffle reg. */
#define S0(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	por x0, x3; \
	pxor x4, x0; \
	pxor x2, x4; \
	pxor RNOT, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x4, x1; \
	pxor x0, x2; \
	pxor x3, x0; \
	por x0, x4; \
	pxor x2, x0; \
	pand x1, x2; \
	pxor x2, x3; \
	pxor RNOT, x1; \
	pxor x4, x2; \
	pxor x2, x1;

#define S1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x1; \
	pxor x3, x0; \
	pxor RNOT, x3; \
	pand x1, x4; \
	por x1, x0; \
	pxor x2, x3; \
	pxor x3, x0; \
	pxor x3, x1; \
	pxor x4, x3; \
	por x4, x1; \
	pxor x2, x4; \
	pand x0, x2; \
	pxor x1, x2; \
	por x0, x1; \
	pxor RNOT, x0; \
	pxor x2, x0; \
	pxor x1, x4;

#define S2(x0, x1, x2, x3, x4) \
	pxor RNOT, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pand x2, x0; \
	pxor x3, x0; \
	por x4, x3; \
	pxor x1, x2; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x2, x0; \
	pand x3, x2; \
	por x1, x3; \
	pxor RNOT, x0; \
	pxor x0, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	por x2, x1;

#define S3(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x3, x1; \
	por x0, x3; \
	pand x0, x4; \
	pxor x2, x0; \
	pxor x1, x2; \
	pand x3, x1; \
	pxor x3, x2; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x0, x1; \
	pand x3, x0; \
	pand x4, x3; \
	pxor x2, x3; \
	por x1, x4; \
	pand x1, x2; \
	pxor x3, x4; \
	pxor x3, x0; \
	pxor x2, x3;

#define S4(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x4, x0; \
	pxor x2, x3; \
	por x4, x2; \
	pxor x1, x0; \
	pxor x3, x4; \
	por x0, x2; \
	pxor x1, x2; \
	pand x0, x1; \
	pxor x4, x1; \
	pand x2, x4; \
	pxor x3, x2; \
	pxor x0, x4; \
	por x1, x3; \
	pxor RNOT, x1; \
	pxor x0, x3;

#define S5(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x0, x1; \
	pxor x1, x2; \
	pxor RNOT, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	pand x4, x1; \
	por x3, x4; \
	pxor x0, x4; \
	pand x3, x0; \
	pxor x3, x1; \
	pxor x2, x3; \
	pxor x1, x0; \
	pand x4, x2; \
	pxor x2, x1; \
	pand x0, x2; \
	pxor x2, x3;

#define S6(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x3; \
	pxor x2, x1; \
	pxor x0, x2; \
	pand x3, x0; \
	por x3, x1; \
	pxor RNOT, x4; \
	pxor x1, x0; \
	pxor x2, x1; \
	pxor x4, x3; \
	pxor x0, x4; \
	pand x0, x2; \
	pxor x1, x4; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x3; \
	pxor x2, x1;

#define S7(x0, x1, x2, x3, x4) \
	pxor RNOT, x1; \
	movdqa x1, x4; \
	pxor RNOT, x0; \
	pand x2, x1; \
	pxor x3, x1; \
	por x4, x3; \
	pxor x2, x4; \
	pxor x3, x2; \
	pxor x0, x3; \
	por x1, x0; \
	pand x0, x2; \
	pxor x4, x0; \
	pxor x3, x4; \
	pand x0, x3; \
	pxor x1, x4; \
	pxor x4, x2; \
	pxor x1, x3; \
	por x0, x4; \
	pxor x1, x4;

/* Bit-sliced inverse S-boxes SI0..SI7 (decryption). */
#define SI0(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pxor x0, x1; \
	por x1, x3; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	pxor x3, x2; \
	pxor x0, x3; \
	pand x1, x0; \
	pxor x2, x0; \
	pand x3, x2; \
	pxor x4, x3; \
	pxor x3, x2; \
	pxor x3, x1; \
	pand x0, x3; \
	pxor x0, x1; \
	pxor x2, x0; \
	pxor x3, x4;

#define SI1(x0, x1, x2, x3, x4) \
	pxor x3, x1; \
	movdqa x0, x4; \
	pxor x2, x0; \
	pxor RNOT, x2; \
	por x1, x4; \
	pxor x3, x4; \
	pand x1, x3; \
	pxor x2, x1; \
	pand x4, x2; \
	pxor x1, x4; \
	por x3, x1; \
	pxor x0, x3; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x4, x2; \
	pxor x0, x1; \
	pxor x1, x4;

#define SI2(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x3, x4; \
	pxor RNOT, x3; \
	por x2, x3; \
	pxor x4, x2; \
	pxor x0, x4; \
	pxor x1, x3; \
	por x2, x1; \
	pxor x0, x2; \
	pxor x4, x1; \
	por x3, x4; \
	pxor x3, x2; \
	pxor x2, x4; \
	pand x1, x2; \
	pxor x3, x2; \
	pxor x4, x3; \
	pxor x0, x4;

#define SI3(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x1, x4; \
	pand x2, x1; \
	pxor x0, x1; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x3, x0; \
	por x1, x3; \
	pxor x2, x1; \
	pxor x3, x1; \
	pxor x2, x0; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x1; \
	pand x2, x0; \
	pxor x3, x4; \
	pxor x0, x3; \
	pxor x1, x0;

#define SI4(x0, x1, x2, x3, x4) \
	pxor x3, x2; \
	movdqa x0, x4; \
	pand x1, x0; \
	pxor x2, x0; \
	por x3, x2; \
	pxor RNOT, x4; \
	pxor x0, x1; \
	pxor x2, x0; \
	pand x4, x2; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x3, x0; \
	pand x2, x3; \
	pxor x3, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x1, x4; \
	pxor x3, x0;

#define SI5(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x2, x1; \
	pxor x4, x2; \
	pxor x3, x1; \
	pand x4, x3; \
	pxor x3, x2; \
	por x0, x3; \
	pxor RNOT, x0; \
	pxor x2, x3; \
	por x0, x2; \
	pxor x1, x4; \
	pxor x4, x2; \
	pand x0, x4; \
	pxor x1, x0; \
	pxor x3, x1; \
	pand x2, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x4, x2; \
	pxor x3, x4;

#define SI6(x0, x1, x2, x3, x4) \
	pxor x2, x0; \
	movdqa x0, x4; \
	pand x3, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x1, x3; \
	por x4, x2; \
	pxor x3, x2; \
	pand x0, x3; \
	pxor RNOT, x0; \
	pxor x1, x3; \
	pand x2, x1; \
	pxor x0, x4; \
	pxor x4, x3; \
	pxor x2, x4; \
	pxor x1, x0; \
	pxor x0, x2;

#define SI7(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x2, x0; \
	por x4, x2; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	por x3, x1; \
	pxor x0, x4; \
	pand x2, x0; \
	pxor x1, x0; \
	pand x2, x1; \
	pxor x2, x3; \
	pxor x3, x4; \
	pand x3, x2; \
	por x0, x3; \
	pxor x4, x1; \
	pxor x4, x3; \
	pand x0, x4; \
	pxor x2, x4;

/*
 * 4x4 32-bit matrix transpose across x0..x3, converting between
 * block-major (one block per register) and bit-sliced (one state word
 * per register) layouts. t1/t2 are scratch; t0 is unused but kept in
 * the parameter list for symmetry with read/write/xor_blocks.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa x0, t2; \
	punpckldq x1, x0; \
	punpckhdq x1, t2; \
	movdqa x2, t1; \
	punpckhdq x3, x2; \
	punpckldq x3, t1; \
	movdqa x0, x1; \
	punpcklqdq t1, x0; \
	punpckhqdq t1, x1; \
	movdqa t2, x3; \
	punpcklqdq x2, t2; \
	punpckhqdq x2, x3; \
	movdqa t2, x2;

/* load four 16-byte blocks (unaligned) and convert to sliced layout */
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu (0*4*4)(in), x0; \
	movdqu (1*4*4)(in), x1; \
	movdqu (2*4*4)(in), x2; \
	movdqu (3*4*4)(in), x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/* convert back to block layout and store four blocks (unaligned) */
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu x0, (0*4*4)(out); \
	movdqu x1, (1*4*4)(out); \
	movdqu x2, (2*4*4)(out); \
	movdqu x3, (3*4*4)(out);

/* like write_blocks, but xor the result into *out (CBC/CTR-style use) */
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu (0*4*4)(out), t0; \
	pxor t0, x0; \
	movdqu x0, (0*4*4)(out); \
	movdqu (1*4*4)(out), t0; \
	pxor t0, x1; \
	movdqu x1, (1*4*4)(out); \
	movdqu (2*4*4)(out), t0; \
	pxor t0, x2; \
	movdqu x2, (2*4*4)(out); \
	movdqu (3*4*4)(out), t0; \
	pxor t0, x3; \
	movdqu x3, (3*4*4)(out);

/*
 * void __serpent_enc_blk_4way(ctx, dst, src, bool xor)
 * Encrypt 4 blocks in parallel; 32 rounds of S-box + linear transform
 * with a final key mixing. Clobbers %eax, %edx, %xmm0-%xmm7, flags.
 */
SYM_FUNC_START(__serpent_enc_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 *	arg_xor(%esp): bool, if true: xor output
	 */

	pcmpeqd RNOT, RNOT;		/* RNOT = all-ones */

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

						 K(RA, RB, RC, RD, RE, 0);
	S0(RA, RB, RC, RD, RE);		LK(RC, RB, RD, RA, RE, 1);
	S1(RC, RB, RD, RA, RE);		LK(RE, RD, RA, RC, RB, 2);
	S2(RE, RD, RA, RC, RB);		LK(RB, RD, RE, RC, RA, 3);
	S3(RB, RD, RE, RC, RA);		LK(RC, RA, RD, RB, RE, 4);
	S4(RC, RA, RD, RB, RE);		LK(RA, RD, RB, RE, RC, 5);
	S5(RA, RD, RB, RE, RC);		LK(RC, RA, RD, RE, RB, 6);
	S6(RC, RA, RD, RE, RB);		LK(RD, RB, RA, RE, RC, 7);
	S7(RD, RB, RA, RE, RC);		LK(RC, RA, RE, RD, RB, 8);
	S0(RC, RA, RE, RD, RB);		LK(RE, RA, RD, RC, RB, 9);
	S1(RE, RA, RD, RC, RB);		LK(RB, RD, RC, RE, RA, 10);
	S2(RB, RD, RC, RE, RA);		LK(RA, RD, RB, RE, RC, 11);
	S3(RA, RD, RB, RE, RC);		LK(RE, RC, RD, RA, RB, 12);
	S4(RE, RC, RD, RA, RB);		LK(RC, RD, RA, RB, RE, 13);
	S5(RC, RD, RA, RB, RE);		LK(RE, RC, RD, RB, RA, 14);
	S6(RE, RC, RD, RB, RA);		LK(RD, RA, RC, RB, RE, 15);
	S7(RD, RA, RC, RB, RE);		LK(RE, RC, RB, RD, RA, 16);
	S0(RE, RC, RB, RD, RA);		LK(RB, RC, RD, RE, RA, 17);
	S1(RB, RC, RD, RE, RA);		LK(RA, RD, RE, RB, RC, 18);
	S2(RA, RD, RE, RB, RC);		LK(RC, RD, RA, RB, RE, 19);
	S3(RC, RD, RA, RB, RE);		LK(RB, RE, RD, RC, RA, 20);
	S4(RB, RE, RD, RC, RA);		LK(RE, RD, RC, RA, RB, 21);
	S5(RE, RD, RC, RA, RB);		LK(RB, RE, RD, RA, RC, 22);
	S6(RB, RE, RD, RA, RC);		LK(RD, RC, RE, RA, RB, 23);
	S7(RD, RC, RE, RA, RB);		LK(RB, RE, RA, RD, RC, 24);
	S0(RB, RE, RA, RD, RC);		LK(RA, RE, RD, RB, RC, 25);
	S1(RA, RE, RD, RB, RC);		LK(RC, RD, RB, RA, RE, 26);
	S2(RC, RD, RB, RA, RE);		LK(RE, RD, RC, RA, RB, 27);
	S3(RE, RD, RC, RA, RB);		LK(RA, RB, RD, RE, RC, 28);
	S4(RA, RB, RD, RE, RC);		LK(RB, RD, RE, RC, RA, 29);
	S5(RB, RD, RE, RC, RA);		LK(RA, RB, RD, RC, RE, 30);
	S6(RA, RB, RD, RC, RE);		LK(RD, RE, RB, RC, RA, 31);
	S7(RD, RE, RB, RC, RA);		 K(RA, RB, RC, RD, RE, 32);

	movl arg_dst(%esp), %eax;

	cmpb $0, arg_xor(%esp);
	jnz .L__enc_xor4;

	write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	RET;

.L__enc_xor4:
	xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	RET;
SYM_FUNC_END(__serpent_enc_blk_4way)

/*
 * void serpent_dec_blk_4way(ctx, dst, src)
 * Decrypt 4 blocks in parallel; inverse S-boxes with descending round
 * keys. Clobbers %eax, %edx, %xmm0-%xmm7, flags.
 */
SYM_FUNC_START(serpent_dec_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 */

	pcmpeqd RNOT, RNOT;		/* RNOT = all-ones */

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

						 K(RA, RB, RC, RD, RE, 32);
	SI7(RA, RB, RC, RD, RE);	KL(RB, RD, RA, RE, RC, 31);
	SI6(RB, RD, RA, RE, RC);	KL(RA, RC, RE, RB, RD, 30);
	SI5(RA, RC, RE, RB, RD);	KL(RC, RD, RA, RE, RB, 29);
	SI4(RC, RD, RA, RE, RB);	KL(RC, RA, RB, RE, RD, 28);
	SI3(RC, RA, RB, RE, RD);	KL(RB, RC, RD, RE, RA, 27);
	SI2(RB, RC, RD, RE, RA);	KL(RC, RA, RE, RD, RB, 26);
	SI1(RC, RA, RE, RD, RB);	KL(RB, RA, RE, RD, RC, 25);
	SI0(RB, RA, RE, RD, RC);	KL(RE, RC, RA, RB, RD, 24);
	SI7(RE, RC, RA, RB, RD);	KL(RC, RB, RE, RD, RA, 23);
	SI6(RC, RB, RE, RD, RA);	KL(RE, RA, RD, RC, RB, 22);
	SI5(RE, RA, RD, RC, RB);	KL(RA, RB, RE, RD, RC, 21);
	SI4(RA, RB, RE, RD, RC);	KL(RA, RE, RC, RD, RB, 20);
	SI3(RA, RE, RC, RD, RB);	KL(RC, RA, RB, RD, RE, 19);
	SI2(RC, RA, RB, RD, RE);	KL(RA, RE, RD, RB, RC, 18);
	SI1(RA, RE, RD, RB, RC);	KL(RC, RE, RD, RB, RA, 17);
	SI0(RC, RE, RD, RB, RA);	KL(RD, RA, RE, RC, RB, 16);
	SI7(RD, RA, RE, RC, RB);	KL(RA, RC, RD, RB, RE, 15);
	SI6(RA, RC, RD, RB, RE);	KL(RD, RE, RB, RA, RC, 14);
	SI5(RD, RE, RB, RA, RC);	KL(RE, RC, RD, RB, RA, 13);
	SI4(RE, RC, RD, RB, RA);	KL(RE, RD, RA, RB, RC, 12);
	SI3(RE, RD, RA, RB, RC);	KL(RA, RE, RC, RB, RD, 11);
	SI2(RA, RE, RC, RB, RD);	KL(RE, RD, RB, RC, RA, 10);
	SI1(RE, RD, RB, RC, RA);	KL(RA, RD, RB, RC, RE, 9);
	SI0(RA, RD, RB, RC, RE);	KL(RB, RE, RD, RA, RC, 8);
	SI7(RB, RE, RD, RA, RC);	KL(RE, RA, RB, RC, RD, 7);
	SI6(RE, RA, RB, RC, RD);	KL(RB, RD, RC, RE, RA, 6);
	SI5(RB, RD, RC, RE, RA);	KL(RD, RA, RB, RC, RE, 5);
	SI4(RD, RA, RB, RC, RE);	KL(RD, RB, RE, RC, RA, 4);
	SI3(RD, RB, RE, RC, RA);	KL(RE, RD, RA, RC, RB, 3);
	SI2(RE, RD, RA, RC, RB);	KL(RD, RB, RC, RA, RE, 2);
	SI1(RD, RB, RC, RA, RE);	KL(RE, RB, RC, RA, RD, 1);
	SI0(RE, RB, RC, RA, RD);	 K(RC, RD, RB, RE, RA, 0);

	movl arg_dst(%esp), %eax;
	write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);

	RET;
SYM_FUNC_END(serpent_dec_blk_4way)
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.