/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
.endm

.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
.endm

#ifdef CONFIG_ALTIVEC
.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
.endm

.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
.endm


.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	CFUNC(exit_vmx_usercopy)
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,3328

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
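	/*
	 * Each iteration below moves one 128B line as sixteen 8-byte
	 * GPR loads followed by sixteen 8-byte stores, using r0,
	 * r6-r12 and r14-r21.
	 */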
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	CFUNC(enter_vmx_usercopy)
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
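	/*
	 * r6 and r9 hold the 128B-aligned source and destination
	 * addresses (the destination is tagged with stream ID 1).
	 * r7 and r10 describe the transfer: the cacheline count,
	 * capped at 0x3FF, combined with a prefetch depth of 7.
	 * DCBT_SETUP_STREAMS consumes these to start both streams.
	 */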
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
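	/*
	 * Each iteration below moves one 128B line as eight 16-byte
	 * VMX loads followed by eight stores (v0-v7), using r9-r12
	 * and r14-r16 as index registers.
	 */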
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	CFUNC(exit_vmx_usercopy)	/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
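	/*
	 * Unaligned variant of the loop above: each 16B store is
	 * built with VPERM from two adjacent misaligned source loads
	 * using the control vector in v16, and v0 carries the
	 * trailing bytes of one iteration into the next.
	 */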
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	CFUNC(exit_vmx_usercopy)	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */