/* SPDX-License-Identifier: GPL-2.0 */
/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
 */

/* Build-time configuration.
 *
 * GLOBAL_SPARE is the extra global register used as a shift count by
 * the copy routine, and RESTORE_ASI() rewrites %asi once the block
 * copy loops are done.  Both differ between the __KERNEL__ build and
 * any other build.
 */
#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE	%g7
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_AIUS, %asi
#else
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

/* Size of the stack frame allocated by the "save" at function entry. */
#ifdef __sparc_v9__
#define SAVE_AMOUNT	128
#else
#define SAVE_AMOUNT	64
#endif

/* Every hook below is defined only if it is not defined already, so
 * whatever precedes this text in the translation unit can supply its
 * own version.
 */

/* ASI written to %asi before the block copy loops run. */
#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

/* EX_LD(insn, stub) / EX_ST(insn, stub) wrap every load and store of
 * the copy routine.  The defaults emit the instruction and discard the
 * stub name.  NOTE(review): the stub argument is presumably the fixup
 * target an overriding definition attaches to the access - confirm
 * against the files that override these macros.
 */
#ifndef EX_LD
#define EX_LD(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif

/* LOAD(type, addr, dest): plain load, or with MEMCPY_DEBUG the
 * alternate-space form of the same load using ASI 0x80.
 */
#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

/* LOAD_TWIN(addr, dest0, dest1): one ldda that fills the register pair
 * starting at dest0; dest1 only documents the second register.
 */
#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/* STORE_INIT(src, addr): 8-byte store through the ASI currently held
 * in %asi, or a plain stx when SIMULATE_NIAGARA_ON_NON_NIAGARA is set.
 */
#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
#endif
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x

/* Return stubs named by the second argument of EX_LD()/EX_ST().
 *
 * Each stub runs in the register window of the copy routine, places a
 * byte count in %i0 (its name spells out the formula) and branches to
 * __restore_asi with the final add/mov in the branch delay slot.
 * NOTE(review): the count is presumably "bytes not copied" as seen by
 * the caller of a user-copy variant - confirm against the users.
 *
 * They are emitted only when EX_RETVAL is not already defined, i.e.
 * once, by the build that does not override it.
 */

/* Common tail: put ASI_AIUS back into %asi and return to the caller.
 *
 * The write has to come before "ret".  A delayed control transfer
 * executes exactly one following instruction, and that slot must hold
 * the "restore" so that the register window is popped and %i0 becomes
 * the caller's %o0.  With "wr" in the delay slot the "restore" would
 * never run and the caller would resume in this routine's window.
 */
__restore_asi:
	wr	%g0, ASI_AIUS, %asi
	ret
	 restore

/* %i0 = %i2 + %i4 + 1.
 * The byte loop that names this stub decrements %i4 before it touches
 * memory, so the byte in flight is added back here.
 */
ENTRY(NG_ret_i2_plus_i4_plus_1)
	add	%i4, 1, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_1)

/* %i0 = %i2 + %g1 */
ENTRY(NG_ret_i2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1)

/* %i0 = %i2 + %g1 - N, for N = 8, 16, ... 56: one stub per 8-byte
 * slot of a 64-byte block.
 */
ENTRY(NG_ret_i2_plus_g1_minus_8)
	sub	%g1, 8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_8)
ENTRY(NG_ret_i2_plus_g1_minus_16)
	sub	%g1, 16, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_16)
ENTRY(NG_ret_i2_plus_g1_minus_24)
	sub	%g1, 24, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_24)
ENTRY(NG_ret_i2_plus_g1_minus_32)
	sub	%g1, 32, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_32)
ENTRY(NG_ret_i2_plus_g1_minus_40)
	sub	%g1, 40, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_40)
ENTRY(NG_ret_i2_plus_g1_minus_48)
	sub	%g1, 48, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_48)
ENTRY(NG_ret_i2_plus_g1_minus_56)
	sub	%g1, 56, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_56)

/* %i0 = %i2 + %i4 */
ENTRY(NG_ret_i2_plus_i4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4)

/* %i0 = %i2 + %i4 - 8 */
ENTRY(NG_ret_i2_plus_i4_minus_8)
	sub	%i4, 8, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_minus_8)

/* %i0 = %i2 + 8 */
ENTRY(NG_ret_i2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 8, %i0
ENDPROC(NG_ret_i2_plus_8)

/* %i0 = %i2 + 4 */
ENTRY(NG_ret_i2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 4, %i0
ENDPROC(NG_ret_i2_plus_4)

/* %i0 = %i2 + 1 */
ENTRY(NG_ret_i2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 1, %i0
ENDPROC(NG_ret_i2_plus_1)

/* %i0 = %i2 + %g1 + 1 */
ENTRY(NG_ret_i2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_plus_1)

/* %i0 = %i2 */
ENTRY(NG_ret_i2)
	ba,pt	%xcc, __restore_asi
	 mov	%i2, %i0
ENDPROC(NG_ret_i2)

/* %i0 = (%i2 & 7) + %i4 */
ENTRY(NG_ret_i2_and_7_plus_i4)
	and	%i2, 7, %i2
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_and_7_plus_i4)
#endif

	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
/* FUNC_NAME (NGmemcpy unless overridden): copy %i2 bytes from %i1 to %i0.
 *
 * In:	%i0 = dst, %i1 = src, %i2 = len
 * Out:	EX_RETVAL(%i0) in the caller's %o0 (via the final restore)
 *
 * A length with any bit at or above bit 31 set raises software trap 5.
 *
 * Dispatch on length:
 *	len == 0		-> 85 (return)
 *	0 < len < 16		-> 80 (word loop if dst|src|len is 4-byte
 *				   aligned, else byte loop at 90)
 *	16 <= len < 128		-> 70 (8-byte paths)
 *	len >= 128		-> 64-byte block copy with init stores,
 *				   then the tail at 60
 *
 * The second argument of every EX_LD()/EX_ST() names the stub whose
 * formula must equal the number of bytes still uncopied at that
 * instruction; keep the two in step when reordering code here.
 */
FUNC_NAME:	/* %i0=dst, %i1=src, %i2=len */
	PREAMBLE
	save		%sp, -SAVE_AMOUNT, %sp
	srlx		%i2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	mov		%i0, %o0
	cmp		%i2, 0
	be,pn		%XCC, 85f
	 or		%o0, %i1, %i3
	cmp		%i2, 16
	blu,a,pn	%XCC, 80f
	 or		%i3, %i2, %i3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
	cmp		%i2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%i3, 0x7, %g0

	/* %o0:	dst
	 * %i1:	src
	 * %i2:	len  (known to be >= 128)
	 *
	 * The block copy loops will use %i4/%i5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %i1, #one_read)
	wr		%g0, STORE_ASI, %asi

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %i4
	be,pt		%XCC, 2f
	 sub		%i4, 64, %i4
	sub		%g0, %i4, %i4	! bytes to align dst
	sub		%i2, %i4, %i2
1:	subcc		%i4, 1, %i4
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
	EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
	add		%i1, 1, %i1
	bne,pt		%XCC, 1b
	add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop.  If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line we have to store the whole
	 * cacheline else we can end up with corrupt L2 cache line
	 * contents.  Since the loop works on 64-bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
2:
	andcc		%i1, (16 - 1), %i4
	andn		%i2, (64 - 1), %g1	! block copy loop iterator
	be,pt		%XCC, 50f
	 sub		%i2, %g1, %i2		! final sub-block copy bytes

	cmp		%i4, 8
	be,pt		%XCC, 10f
	 sub		%i1, %i4, %i1

	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
	and		%i4, 0x7, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE
	mov		64, %i5
	EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
	sub		%i5, GLOBAL_SPARE, %i5
	mov		16, %o4
	mov		32, %o5
	mov		48, %o7
	mov		64, %i3

	/* Condition codes are still those of "cmp %i4, 8" above: a
	 * source offset above 8 within its 16-byte chunk takes loop 9,
	 * whose first useful word is the second one of the twin load.
	 */
	bg,pn		%XCC, 9f
	 nop

/* MIX_THREE_WORDS: funnel-shift two output words out of three inputs.
 *	WORD1 = (WORD1 << POST_SHIFT) | (WORD2 >> PRE_SHIFT)
 *	WORD2 = (WORD2 << POST_SHIFT) | (WORD3 >> PRE_SHIFT)
 * TMP is clobbered, WORD3 is left unchanged.
 */
#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
	sllx		WORD1, POST_SHIFT, WORD1; \
	srlx		WORD2, PRE_SHIFT, TMP; \
	sllx		WORD2, POST_SHIFT, WORD2; \
	or		WORD1, TMP, WORD1; \
	srlx		WORD3, PRE_SHIFT, TMP; \
	or		WORD2, TMP, WORD2;

8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 8b
	 add		%o0, 64, %o0

	/* Undo the rounding-down of src done before the loop. */
	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 9b
	 add		%o0, 64, %o0

	/* Undo the rounding-down of src done before the loop. */
	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned but it has been subtracted by 8 and we perform
	 * one twin load ahead, then add 8 back into source when
	 * we finish the loop.
	 */
	EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
	mov		16, %o7
	mov		32, %g2
	mov		48, %g3
	mov		64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, 0x8, %i1

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
	mov		16, %o7
	mov		32, %g2
	mov		48, %g3
	mov		64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	add		%i1, 64, %i1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0
	/* fall through */

60:
	membar		#Sync

	/* %i2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
	RESTORE_ASI(%i3)
	brz,pt		%i2, 85f
	 sub		%o0, %i1, %i3	! %i3 = dst - src, used as store offset at 90
	ba,a,pt		%XCC, 90f
	 nop

	.align		64
70: /* 16 <= len < 128; condition codes hold (dst | src) & 7 */
	bne,pn		%XCC, 75f
	 sub		%o0, %i1, %i3	! %i3 = dst - src

	/* dst and src both 8-byte aligned: 16 bytes per iteration.
	 * Entered only with %i2 >= 16, stores go to %i1 + %i3.
	 */
72:
	andn		%i2, 0xf, %i4	! bytes handled by this loop
	and		%i2, 0xf, %i2	! bytes left for 73 and below

	/* %i4 is decremented only after both stores.  The stubs named
	 * on the accesses compute %i2 + %i4 (and %i2 + %i4 - 8 for the
	 * second store), which is the uncopied byte count only while
	 * %i4 still includes the current 16-byte chunk.  The add that
	 * follows the subcc does not touch the condition codes, so the
	 * loop branch is unaffected by the placement.
	 */
1:	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4)
	add		%i1, 0x08, %i1
	EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4)
	sub		%i1, 0x08, %i1
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4)
	add		%i1, 0x8, %i1
	EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8)
	subcc		%i4, 0x10, %i4
	bgu,pt		%XCC, 1b
	 add		%i1, 0x8, %i1

	/* Tail of the aligned path: one optional 8-byte move, one
	 * optional 4-byte move, then the byte loop for whatever is left.
	 */
73:	andcc		%i2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x8, %i2
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
	add		%i1, 0x8, %i1
1:	andcc		%i2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x4, %i2
	EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
	add		%i1, 0x4, %i1
1:	cmp		%i2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

	/* dst and/or src not 8-byte aligned.  First copy single bytes
	 * until dst is 8-byte aligned (%g1 = 8 - (dst & 7) of them).
	 */
75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%i2, %g1, %i2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
	EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%i1, 1, %i1

	/* dst is now 8-byte aligned; recompute it from src + offset.
	 * %g1 becomes the src misalignment in bits.
	 */
2:	add		%i1, %i3, %o0
	andcc		%i1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	/* src is 8-byte aligned as well: reuse the aligned paths. */
	cmp		%i2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

	/* src misaligned: read aligned 8-byte words and merge each
	 * pair with shifts.  %g1 = left shift, %i3 = 64 - %g1 = right
	 * shift, %g2 carries the already shifted previous word.
	 */
8:	mov		64, %i3
	andn		%i1, 0x7, %i1
	EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
	sub		%i3, %g1, %i3
	andn		%i2, 0x7, %i4	! bytes handled by this loop
	sllx		%g2, %g1, %g2

	/* As in loop 72, %i4 is decremented only after the store: the
	 * stub on both accesses computes (%i2 & 7) + %i4, which must
	 * still include the 8 bytes of the current iteration.  None of
	 * the instructions between the subcc and the branch modifies
	 * the condition codes.
	 */
1:	add		%i1, 0x8, %i1
	EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
	srlx		%g3, %i3, %i5
	or		%i5, %g2, %i5
	EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4)
	subcc		%i4, 0x8, %i4
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	/* Turn %g1 back into a byte offset and re-apply it to src,
	 * then finish the last (len & 7) bytes in the byte loop.
	 */
	srl		%g1, 3, %g1
	andcc		%i2, 0x7, %i2
	be,pn		%icc, 85f
	 add		%i1, %g1, %i1
	ba,pt		%xcc, 90f
	 sub		%o0, %i1, %i3

	.align		64
80: /* 0 < len < 16; %i3 = dst | src | len */
	andcc		%i3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %i1, %i3	! %i3 = dst - src

	/* Everything is 4-byte aligned: copy one word per iteration. */
1:
	subcc		%i2, 4, %i2
	EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%i1, 4, %i1

85:	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	/* Byte loop: %i2 = bytes left (non-zero), %i3 = dst - src. */
	.align		32
90:
	subcc		%i2, 1, %i2
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
	EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%i1, 1, %i1
	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.size		FUNC_NAME, .-FUNC_NAME
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.