/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memset16 to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 *
 * Layout of this file: three entry points that share one algorithm and
 * differ only in how the 64-bit store pattern in $17 is obtained:
 *	___memset		- replicates the low byte of $17
 *	__constant_c_memset	- stores $17 exactly as passed in
 *	__memset16		- replicates the low 16 bits of $17
 * Each copy has its own label set (suffix _b, none, _w respectively).
 * The instructions are grouped four at a time; blank lines separate the
 * groups and the nops are scheduling filler.
 */
#include <linux/export.h>
	.set noat
	.set noreorder
.text
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memset16
	.globl __constant_c_memset

/*
 * ___memset (aliased to memset and __memset at the end of this file)
 *
 * In:   $16 = destination address
 *       $17 = fill value; only its low byte is used
 *       $18 = length in bytes; a value <= 0 (signed) stores nothing
 *       $26 = return address
 * Out:  $0  = the destination address as passed in
 * Uses: $1-$7 as scratch; $16, $17 and $18 are overwritten.
 * No stack frame is created (.frame $30,0,$26,0).
 */
	.ent ___memset
.align 5
___memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant materialization
	 * with other parts of the fall-through code.  This is important, even
	 * though it makes maintenance tougher.
	 * Do this later.
	 *
	 * The trailing comments in the next groups show the quadword being
	 * built, most significant byte first; "ch" is the fill byte.
	 */
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero length requested?

	addq $18,$16,$6		# E : max address to write to
	bis	$1,$2,$17	# E : 000000000000chch
	insbl	$1,2,$3		# U : 0000000000ch0000
	insbl	$1,3,$4		# U : 00000000ch000000

	or	$3,$4,$3	# E : 00000000chch0000
	inswl	$17,4,$5	# U : 0000chch00000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?
	inswl	$17,6,$2	# U : chch000000000000

	or	$17,$3,$17	# E : 00000000chchchch
	or	$2,$5,$2	# E : chchchch00000000
	bic	$1,7,$1		# E : fit within a single quadword
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : chchchchchchchch
	beq	$1,within_quad_b # U :
	nop			# E :
	beq	$3,aligned_b	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the new bytes into the first (partial) quadword, then step
	 * $16 up to the next quadword boundary and shrink the count in $18
	 * by the number of bytes just written.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_b	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in [-0x40, -1] and is
	 * never zero, so the beq below looks unreachable; a destination
	 * that is already 0mod64 still takes eight trips through the
	 * single-quad loop.  Harmless (at least 16 quads remain, and $4 is
	 * always set by that loop before the wh64) - confirm before changing.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_b	# U :

$alignmod64_b:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_b # U :

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address of trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 *
	 * Each trip stores 8 quadwords (64 bytes).  The loop repeats while
	 * at least 16 quads remained at the top of the trip, so 8..15 quads
	 * are left for the simple loop / tail code when it exits.
	 */

$do_wh64_b:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_b	# U :

	nop
	nop
	nop
	beq	$3, no_quad_b	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq	$17,0($5)	# L :
	subq	$3,1,$3		# E : Decrement number quads left
	addq	$5,8,$5		# E : Inc address
	bne	$3,loop_b	# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.
	 * $5 is quad aligned here; the byte offset used by mskqh/insqh
	 * comes from the low bits of the end address in $6.
	 */
	nop			# E :
	beq	$18,end_b	# U : All done?
	ldq	$7,0($5)	# L :
	mskqh	$7,$6,$2	# U : Mask final quad

	insqh	$17,$6,$4	# U : New bits
	bis	$2,$4,$1	# E : Put it all together
	stq	$1,0($5)	# L : And back to memory
	ret	$31,($26),1	# L0 :

within_quad_b:
	/*
	 * Start and end lie in the same quadword: build the new quad from
	 * the start offset upward, then restore the original bytes from
	 * the end offset upward, and write it back with one store.
	 */
	ldq_u	$1,0($16)	# L :
	insql	$17,$16,$2	# U : New bits
	mskql	$1,$16,$4	# U : Clear old
	bis	$2,$4,$2	# E : New result

	mskql	$2,$6,$4	# U :
	mskqh	$1,$6,$2	# U :
	bis	$2,$4,$1	# E :
	stq_u	$1,0($16)	# L :

end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end ___memset
	EXPORT_SYMBOL(___memset)

	/*
	 * This is the original body of code, prior to replication and
	 * rescheduling.  Leave it here, as there may be calls to this
	 * entry point.
	 *
	 * __constant_c_memset
	 *
	 * In:   $16 = destination address
	 *       $17 = 64-bit pattern to store; this routine does not
	 *             replicate it, whole quadwords are written with $17
	 *             as-is (presumably the caller passes an already
	 *             replicated constant - verify against callers)
	 *       $18 = length in bytes; a value <= 0 (signed) stores nothing
	 *       $26 = return address
	 * Out:  $0  = the destination address as passed in
	 * Uses: $1-$7 as scratch; $16 and $18 are overwritten.
	 */
.align 4
	.ent __constant_c_memset
__constant_c_memset:
	.frame $30,0,$26,0
	.prologue 0

	addq $18,$16,$6		# E : max address to write to
	bis $16,$16,$0		# E : return value
	xor $16,$6,$1		# E : will complete write be within one quadword?
	ble $18,end		# U : zero length requested?

	bic $1,7,$1		# E : fit within a single quadword
	beq $1,within_one_quad	# U :
	and $16,7,$3		# E : Target addr misalignment
	beq $3,aligned		# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the new bytes into the first (partial) quadword, then step
	 * $16 up to the next quadword boundary and shrink the count in $18
	 * by the number of bytes just written.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad		# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in [-0x40, -1] and is
	 * never zero, so the beq below looks unreachable; a destination
	 * that is already 0mod64 still takes eight trips through the
	 * single-quad loop.  Harmless (at least 16 quads remain, and $4 is
	 * always set by that loop before the wh64) - confirm before changing.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign	# U :

$alignmod64:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64	# U :

$bigalign:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address of trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 *
	 * Each trip stores 8 quadwords (64 bytes).  The loop repeats while
	 * at least 16 quads remained at the top of the trip, so 8..15 quads
	 * are left for the simple loop / tail code when it exits.
	 */

$do_wh64:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64	# U :

	nop
	nop
	nop
	beq	$3, no_quad	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop:
	stq	$17,0($5)	# L :
	subq	$3,1,$3		# E : Decrement number quads left
	addq	$5,8,$5		# E : Inc address
	bne	$3,loop		# U : more?

no_quad:
	/*
	 * Write 0..7 trailing bytes.
	 * $5 is quad aligned here; the byte offset used by mskqh/insqh
	 * comes from the low bits of the end address in $6.
	 */
	nop			# E :
	beq	$18,end		# U : All done?
	ldq	$7,0($5)	# L :
	mskqh	$7,$6,$2	# U : Mask final quad

	insqh	$17,$6,$4	# U : New bits
	bis	$2,$4,$1	# E : Put it all together
	stq	$1,0($5)	# L : And back to memory
	ret	$31,($26),1	# L0 :

within_one_quad:
	/*
	 * Start and end lie in the same quadword: build the new quad from
	 * the start offset upward, then restore the original bytes from
	 * the end offset upward, and write it back with one store.
	 */
	ldq_u	$1,0($16)	# L :
	insql	$17,$16,$2	# U : New bits
	mskql	$1,$16,$4	# U : Clear old
	bis	$2,$4,$2	# E : New result

	mskql	$2,$6,$4	# U :
	mskqh	$1,$6,$2	# U :
	bis	$2,$4,$1	# E :
	stq_u	$1,0($16)	# L :

end:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __constant_c_memset
	EXPORT_SYMBOL(__constant_c_memset)

	/*
	 * This is a replicant of the __constant_c_memset code, rescheduled
	 * to mask stalls.  Note that entry point names also had to change
	 *
	 * __memset16
	 *
	 * In:   $16 = destination address
	 *       $17 = fill value; only its low 16 bits are used
	 *       $18 = length in bytes (the code computes the end address as
	 *             $16 + $18 and the quad count as $18 >> 3); a value
	 *             <= 0 (signed) stores nothing
	 *       $26 = return address
	 * Out:  $0  = the destination address as passed in
	 * Uses: $1-$7 as scratch; $16, $17 and $18 are overwritten.
	 *
	 * In the trailing comments below, "c1c2" is the 16-bit fill value.
	 */
	.align 5
	.ent __memset16

__memset16:
	.frame $30,0,$26,0
	.prologue 0

	inswl $17,0,$5		# U : 000000000000c1c2
	inswl $17,2,$2		# U : 00000000c1c20000
	bis $16,$16,$0		# E : return value
	addq	$18,$16,$6	# E : max address to write to

	ble $18, end_w		# U : zero length requested?
	inswl	$17,4,$3	# U : 0000c1c200000000
	inswl	$17,6,$4	# U : c1c2000000000000
	xor	$16,$6,$1	# E : will complete write be within one quadword?

	or	$2,$5,$2	# E : 00000000c1c2c1c2
	or	$3,$4,$17	# E : c1c2c1c200000000
	bic	$1,7,$1		# E : fit within a single quadword
	and	$16,7,$3	# E : Target addr misalignment

	or	$17,$2,$17	# E : c1c2c1c2c1c2c1c2
	beq $1,within_quad_w	# U :
	nop
	beq $3,aligned_w	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the new bytes into the first (partial) quadword, then step
	 * $16 up to the next quadword boundary and shrink the count in $18
	 * by the number of bytes just written.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_w:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_w	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The max quadword address to write to
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and	$16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq	$3, 16, $4	# E : Only try to unroll if > 128 bytes
	subq	$2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt	$4, loop_w	# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in [-0x40, -1] and is
	 * never zero, so the beq below looks unreachable; a destination
	 * that is already 0mod64 still takes eight trips through the
	 * single-quad loop.  Harmless (at least 16 quads remain, and $4 is
	 * always set by that loop before the wh64) - confirm before changing.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq	$1, $bigalign_w	# U :

$alignmod64_w:
	stq	$17, 0($5)	# L :
	subq	$3, 1, $3	# E : For consistency later
	addq	$1, 8, $1	# E : Increment towards zero for alignment
	addq	$5, 8, $4	# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq	$5, 8, $5	# E : Inc address
	blt	$1, $alignmod64_w # U :

$bigalign_w:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address of trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 *
	 * Each trip stores 8 quadwords (64 bytes).  The loop repeats while
	 * at least 16 quads remained at the top of the trip, so 8..15 quads
	 * are left for the simple loop / tail code when it exits.
	 */

$do_wh64_w:
	wh64	($4)		# L1 : memory subsystem write hint
	subq	$3, 24, $2	# E : For determining future wh64 addresses
	stq	$17, 0($5)	# L :
	nop			# E :

	addq	$5, 128, $4	# E : speculative target of next wh64
	stq	$17, 8($5)	# L :
	stq	$17, 16($5)	# L :
	addq	$5, 64, $7	# E : Fallback address for wh64 (== next trip addr)

	stq	$17, 24($5)	# L :
	stq	$17, 32($5)	# L :
	cmovlt	$2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq	$17, 40($5)	# L :
	stq	$17, 48($5)	# L :
	subq	$3, 16, $2	# E : Repeat the loop at least once more?
	nop

	stq	$17, 56($5)	# L :
	addq	$5, 64, $5	# E :
	subq	$3, 8, $3	# E :
	bge	$2, $do_wh64_w	# U :

	nop
	nop
	nop
	beq	$3, no_quad_w	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_w:
	stq	$17,0($5)	# L :
	subq	$3,1,$3		# E : Decrement number quads left
	addq	$5,8,$5		# E : Inc address
	bne	$3,loop_w	# U : more?

no_quad_w:
	/*
	 * Write 0..7 trailing bytes.
	 * $5 is quad aligned here; the byte offset used by mskqh/insqh
	 * comes from the low bits of the end address in $6.
	 */
	nop			# E :
	beq	$18,end_w	# U : All done?
	ldq	$7,0($5)	# L :
	mskqh	$7,$6,$2	# U : Mask final quad

	insqh	$17,$6,$4	# U : New bits
	bis	$2,$4,$1	# E : Put it all together
	stq	$1,0($5)	# L : And back to memory
	ret	$31,($26),1	# L0 :

within_quad_w:
	/*
	 * Start and end lie in the same quadword: build the new quad from
	 * the start offset upward, then restore the original bytes from
	 * the end offset upward, and write it back with one store.
	 */
	ldq_u	$1,0($16)	# L :
	insql	$17,$16,$2	# U : New bits
	mskql	$1,$16,$4	# U : Clear old
	bis	$2,$4,$2	# E : New result

	mskql	$2,$6,$4	# U :
	mskqh	$1,$6,$2	# U :
	bis	$2,$4,$1	# E :
	stq_u	$1,0($16)	# L :

end_w:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :

	.end __memset16
	EXPORT_SYMBOL(__memset16)

/* memset and __memset are plain aliases for the byte-fill entry point. */
memset = ___memset
__memset = ___memset
	EXPORT_SYMBOL(memset)
	EXPORT_SYMBOL(__memset)
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.