/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * SPARC T7/M7 Flow :
 *
 * if (count < SMALL_MAX) {
 *   if count < SHORTCOPY		(SHORTCOPY=3)
 *	copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= SHORTCHECK	(SHORTCHECK=14)
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {				more than SMALL_MAX bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < MED_MAX) {
 * finish_long:				src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {				src/dst aligned; count > MED_MAX
 *       align dst on 64 byte boundary; for main data movement:
 *       prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *       Use BIS (block initializing store) to avoid copying store cache
 *       lines from memory. But pre-store first element of each cache line
 *       ST_CHUNK lines in advance of the rest of that cache line. That
 *       gives time for replacement cache lines to be written back without
 *       excess STQ and Miss Buffer filling. Repeat until near the end,
 *       then finish up storing before going to finish_long.
 *     }
 *   } else {				src/dst not aligned on 8 bytes
 *     if src is word aligned and count < MED_WMAX
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < MED_UMAX
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; prefetch src data to L1 cache
 *       loadx8, falign, block-store, prefetch loop
 *       (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
 *   }
 *
 */

#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
 * line as "least recently used" which means if many threads are
 * active, it has a high probability of being pushed out of the cache
 * between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
 * marks the cache line as "most recently used" for all
 * but the last cache line.
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI	ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)	stxa src, [addr] STORE_MRU_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	M7memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#define	BLOCK_SIZE	64
#define	SHORTCOPY	3
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define	SMALL_MAX	128
#define	MED_UMAX	1024	/* max copy for medium un-aligned case */
#define	MED_WMAX	1024	/* max copy for medium word-aligned case */
#define	MED_MAX		1024	/* max copy for medium longword-aligned case */
#define	ST_CHUNK	24	/* ST_CHUNK - block of values for BIS Store */
#define	ALIGN_PRE	24	/* distance for aligned prefetch loop */

	.register	%g2,#scratch
	.register	%g3,#scratch

	.section	".text"
	.global		FUNC_NAME
	.type		FUNC_NAME, #function
	.align		16
FUNC_NAME:
	srlx	%o2, 31, %g2
	cmp	%g2, 0
	tne	%xcc, 5
	PREAMBLE
	mov	%o0, %g1		! save %o0
	brz,pn	%o2, .Lsmallx
	 cmp	%o2, 3
	ble,pn	%icc, .Ltiny_cp
	 cmp	%o2, 19
	ble,pn	%icc, .Lsmall_cp
	 or	%o0, %o1, %g2
	cmp	%o2, SMALL_MAX
	bl,pn	%icc, .Lmedium_cp
	 nop

.Lmedium:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .Ldst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	 sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	add	%o1, %o0, %o4
	EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)	! load one byte
	subcc	%o5, 1, %o5
	EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)	! store one byte
	bgu,pt	%xcc, 7b
	 add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	 nop

.Lsrc_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	set	MED_MAX, %o3
	cmp	%o2, %o3		! limit to store buffer size
	bgu,pn	%xcc, .Llarge_align8_copy
	 nop

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
	subcc	%o2, 63, %o2		! adjust length to allow cc test
	ble,pn	%xcc, .Lmedl63		! skip big loop if less than 64 bytes
	 nop
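/*
 * For orientation, a rough C sketch of the .Lmedl64 loop below
 * (illustration only: the count has been pre-biased by -63 above,
 * and the EX_LD/EX_ST fault handling has no C equivalent):
 *
 *	u64 *d = dst, *s = src;
 *	do {
 *		d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
 *		d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
 *		s += 8; d += 8;
 *	} while ((count -= 64) > 0);	// the 'bgu' on the biased count
 */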
.Lmedl64:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)	! load
	subcc	%o2, 64, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56)	! a block of 64
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
	EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
	EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)
	EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
	EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)
	add	%o1, 64, %o1		! increase src ptr by 64
	EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
	add	%o0, 64, %o0		! increase dst ptr by 64
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
	bgu,pt	%xcc, .Lmedl64		! repeat if at least 64 bytes left
	 EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
	addcc	%o2, 32, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl31		! to skip if 31 or fewer bytes left
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)	! load
	sub	%o2, 32, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24)	! a block of 32
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl15		! skip if 15 or fewer bytes left
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
	sub	%o2, 16, %o2		! decrease count by 16
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%xcc, .Lsmallx		! exit if finished
	 cmp	%o2, 8
	blt,pt	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	 tst	%o2
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)	! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz,pn	%xcc, .Lmedw7
	 EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
	retl
	 mov	EX_RETVAL(%g1), %o0	! restore %o0

	.align 16
.Lsrc_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc	%o1, 0x3, %o5		! test word alignment
	bnz,pt	%xcc, .Lunalignsetup	! branch if not word aligned
	 nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination are in cache, for medium
 * to short data moves.
 */
	set	MED_WMAX, %o3
	cmp	%o2, %o3		! limit to store buffer size
	bge,pt	%xcc, .Lunalignrejoin	! otherwise rejoin unaligned path
	 nop

	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%xcc, .Lmedw31		! skip big loop if less than 32
.Lmedw32:
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
	subcc	%o2, 32, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
	or	%o4, %o5, %o5
	bgu,pt	%xcc, .Lmedw32		! repeat if at least 32 bytes left
	 EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
	addcc	%o2, 31, %o2		! restore count

	bz,pt	%xcc, .Lsmallx		! exit if finished
	 nop
	cmp	%o2, 16
	blt,pt	%xcc, .Lmedw15
	 nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		! decrease count by 16
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
	bz,pt	%xcc, .Lsmallx		! exit if finished
	 cmp	%o2, 8
	blt,pn	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	 tst	%o2
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)
	subcc	%o2, 8, %o2		! decrease count by 8
	EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)
	add	%o0, 8, %o0		! increase dst ptr by 8
	EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)
	bz,pt	%xcc, .Lsmallx		! exit if finished
.Lmedw7:				! count is 7 or less
	 cmp	%o2, 4			! check for 4 bytes left
	blt,pn	%xcc, .Lsmallleft3	! skip if 3 or fewer bytes left
	 nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)
	add	%o1, 4, %o1		! increase src ptr by 4
	add	%o0, 4, %o0		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.Lsmallleft3
	 EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)
	retl
	 mov	EX_RETVAL(%g1), %o0

	.align 16
.Llarge_align8_copy:			! Src and dst share 8 byte alignment
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .Laligned_to_64
	 andcc	%o0, 8, %o3		! odd long words to move?
	brz,pt	%o3, .Laligned_to_16
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
	andcc	%o0, 16, %o3		! pair of long words to move?
	brz,pt	%o3, .Laligned_to_32
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 16, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
	andcc	%o0, 32, %o3		! four long words to move?
	brz,pt	%o3, .Laligned_to_64
	 nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 32, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
	!
	! Using block init store (BIS) instructions to avoid fetching cache
	! lines from memory. Use ST_CHUNK stores to first element of each cache
	! line (similar to prefetching) to avoid overfilling STQ or miss buffer.
	! Gives existing cache lines time to be moved out of L1/L2/L3 cache.
	! Initial stores using MRU version of BIS to keep cache line in
	! cache until we are ready to store final element of cache line.
	! Then store last element using the LRU version of BIS.
	!
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	!
	! We use STORE_MRU_ASI for the first seven stores into each cache line
	! followed by STORE_ASI (mark as LRU) for the last store. That
	! mixed approach reduces the probability that the cache line is removed
	! before we finish setting it, while minimizing the effects on
	! other cached values during a large memcpy
	!
	! ST_CHUNK batches up initial BIS operations for several cache lines
	! to allow multiple requests to not be blocked by overflowing
	! the store miss buffer. Then the matching stores for all those
	! BIS operations are executed.
	!

	sub	%o0, 8, %o0		! adjust %o0 for ASI alignment
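/*
 * Shape of the BIS store scheduling below, as C-like pseudocode
 * (illustration only: bis_store_first()/bis_store_rest() are
 * hypothetical stand-ins for the STORE_INIT_MRU/STORE_INIT stxa
 * forms, and the fault handling is omitted):
 *
 *	while (block_bytes >= ST_CHUNK * 64) {
 *		for (i = 0; i < ST_CHUNK; i++)	// .Lalign_loop_start
 *			bis_store_first(dst + i*64, src + i*64);
 *		for (i = 0; i < ST_CHUNK; i++)	// .Lalign_loop_rest
 *			bis_store_rest(dst + i*64, src + i*64);
 *		dst += ST_CHUNK*64; src += ST_CHUNK*64;
 *		block_bytes -= ST_CHUNK*64;
 *	}
 *	// any remaining full blocks use plain stx in .Lalign_loop_fin
 */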
.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%xcc, .Lalign_loop_fin
	 mov	ST_CHUNK, %o3
.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	bgu	%xcc, .Lalign_loop_start
	 add	%o0, 56, %o0

	mov	ST_CHUNK, %o3
	sllx	%o3, 6, %o4		! ST_CHUNK*64
	sub	%o1, %o4, %o1		! reset %o1
	sub	%o0, %o4, %o0		! reset %o0

.Lalign_loop_rest:
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 16, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
	sub	%o5, 64, %o5
	bgu	%xcc, .Lalign_loop_rest
	! mark cache line as LRU
	 EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

	cmp	%o5, ST_CHUNK*64
	bgu,pt	%xcc, .Lalign_loop_start
	 mov	ST_CHUNK, %o3

	cmp	%o5, 0
	beq	.Lalign_done
	 nop
.Lalign_loop_fin:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_plus_64)
	EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_plus_64)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_plus_64)
	EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_plus_64)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_plus_64)
	EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_plus_64)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_plus_64)
	add	%o1, 64, %o1
	EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_plus_64)
	add	%o0, 64, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_plus_64)
	bgu	%xcc, .Lalign_loop_fin
	 EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

.Lalign_done:
	add	%o0, 8, %o0		! restore %o0 from ASI alignment
	membar	#StoreStore
	sub	%o2, 63, %o2		! adjust length to allow cc test
	ba	.Lmedl63		! in .Lmedl63, finish the copy
	 nop

	.align 16
	! Dst is on 8 byte boundary; src is not
.Lunalignsetup:
.Lunalignrejoin:
	mov	%g1, %o3		! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
	VISEntryHalf
#endif
	mov	%o3, %g1		! restore %g1

	set	MED_UMAX, %o3
	cmp	%o2, %o3		! check for medium unaligned limit
	bge,pt	%xcc, .Lunalign_large
	 prefetch [%o1 + (4 * BLOCK_SIZE)], 20
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	cmp	%o2, 8			! Insure we do not load beyond
	bgt	.Lunalign_adjust	! end of source buffer
	 andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o2, 64, %o2		! adjust to leave loop
	sub	%o5, 64, %o5		! early if necessary
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 past end of src
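/*
 * The loop below relies on alignaddr/faligndata: alignaddr has just
 * latched the source's byte offset in %gsr, and each faligndata then
 * extracts one aligned 8-byte result from a pair of adjacent aligned
 * doublewords. Roughly, in C (illustration only; big-endian, nonzero
 * offset assumed, with shift = (src_addr & 7) * 8):
 *
 *	out = (prev << shift) | (next >> (64 - shift));
 *
 * where prev/next are successive aligned 8-byte loads from the
 * rounded-down source address.
 */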
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_fp)
.Lunalign_loop:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5_fp)
	faligndata %f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5_fp)
	subcc	%o5, BLOCK_SIZE, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64_fp)
	faligndata %f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_64_fp)
	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_64_fp)
	faligndata %f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_64_fp)
	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_64_fp)
	faligndata %f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_64_fp)
	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_64_fp)
	faligndata %f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_64_fp)
	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_64_fp)
	faligndata %f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_64_fp)
	add	%o4, BLOCK_SIZE, %o4
	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_64_fp)
	faligndata %f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_64_fp)
	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_64_fp)
	faligndata %f14, %f0, %f30
	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_64_fp)
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%xcc, .Lunalign_loop
	 prefetch [%o4 + (5 * BLOCK_SIZE)], 20
	ba	.Lunalign_done
	 nop

.Lunalign_large:
	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%xcc, .Lunalignsrc
	 sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%o1, 0x1, %o5
	bnz	%xcc, .Lunalignbyte	! check for byte alignment
	 nop
	andcc	%o1, 2, %o5		! check for half word alignment
	bnz	%xcc, .Lunalignhalf
	 nop
	! Src is word aligned
.Lunalignword:
	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3_fp)	! load 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3_fp)	! and store 4
	subcc	%o3, 8, %o3		! decrease count by 8
	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4_fp)	! load 4
	add	%o0, 8, %o0		! increase dst ptr by 8
	bnz	%xcc, .Lunalignword
	 EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4_fp)
	ba	.Lunalignsrc
	 nop

	! Src is half-word aligned
.Lunalignhalf:
	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3_fp)	! load 2 bytes
	sllx	%o4, 32, %o5		! shift left
	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3_fp)
	or	%o4, %o5, %o5
	sllx	%o5, 16, %o5
	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3_fp)
	or	%o4, %o5, %o5
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3_fp)
	add	%o1, 8, %o1
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignhalf
	 add	%o0, 8, %o0
	ba	.Lunalignsrc
	 nop

	! Src is Byte aligned
.Lunalignbyte:
	sub	%o0, %o1, %o0		! share pointer advance
.Lunalignbyte_loop:
	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3_fp)
	sllx	%o4, 56, %o5
	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3_fp)
	sllx	%o4, 40, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3_fp)
	sllx	%o4, 24, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3_fp)
	sllx	%o4, 8, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3_fp)
	or	%o4, %o5, %o5
	add	%o0, %o1, %o0
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3_fp)
	sub	%o0, %o1, %o0
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignbyte_loop
	 add	%o1, 8, %o1
	add	%o0, %o1, %o0		! restore pointer
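/*
 * Each of the three loops above (.Lunalignword/.Lunalignhalf/
 * .Lunalignbyte) assembles an aligned 8-byte store out of the widest
 * loads the source alignment allows. The half-word case, as
 * illustrative big-endian C (load16/load32/store64 are hypothetical
 * stand-ins for lduh/lduw/stx):
 *
 *	u64 x;
 *	x = load16(s);			// bytes 0-1
 *	x = (x << 32) | load32(s + 2);	// bytes 2-5
 *	x = (x << 16) | load16(s + 6);	// bytes 6-7
 *	store64(d, x);
 */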
	! Destination is now block (64 byte) aligned
.Lunalignsrc:
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	add	%o2, 64, %o2		! Insure we do not load beyond
	sub	%o5, 64, %o5		! end of source buffer

	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 past end of src

	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5_fp)
	add	%o4, 8, %o4
.Lunalign_sloop:
	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5_fp)
	faligndata %f14, %f16, %f0
	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5_fp)
	faligndata %f16, %f18, %f2
	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5_fp)
	faligndata %f18, %f20, %f4
	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5_fp)
	subcc	%o5, 64, %o5
	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_64_fp)
	faligndata %f20, %f22, %f6
	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_64_fp)
	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_64_fp)
	faligndata %f22, %f24, %f8
	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_64_fp)
	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_64_fp)
	faligndata %f24, %f26, %f10
	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_64_fp)
	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_64_fp)
	faligndata %f26, %f28, %f12
	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_64_fp)
	add	%o4, 64, %o4
	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_64_fp)
	faligndata %f28, %f30, %f14
	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_64_fp)
	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_64_fp)
	add	%o0, 64, %o0
	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_64_fp)
	fsrc2	%f30, %f14
	bgu,pt	%xcc, .Lunalign_sloop
	 prefetch [%o4 + (8 * BLOCK_SIZE)], 20

.Lunalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%o2, 15
	bleu	%xcc, .Lunalign_short

	 andn	%o2, 0x7, %o5		! %o5 is multiple of 8
	and	%o2, 0x7, %o2		! residue bytes in %o2
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		! insure we do not load past end of src
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o1, %o5, %o1		! advance %o1 past end of src
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_fp)
.Lunalign_by8:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5_fp)
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_8_fp)
	fsrc2	%f2, %f0
	bgu,pt	%xcc, .Lunalign_by8
	 add	%o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	ba	.Lsmallrest
	 nop
/*
 * This is a special case of nested memcpy. This can happen when kernel
 * calls unaligned memcpy back to back without saving FP registers. We need
 * traps(context switch) to save/restore FP registers. If the kernel calls
 * memcpy without this trap sequence we will hit FP corruption. Let's use
 * the normal integer load/store method in this case.
 */

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
	or	%o0, %o1, %g2
#endif
.Lmedium_cp:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc	%g2, 0x7, %g0
	bne,pn	%xcc, .Lmedium_unaligned_cp
	 nop

.Lmedium_noprefetch_cp:
	andncc	%o2, 0x20 - 1, %o5
	be,pn	%xcc, 2f
	 sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 0x20, %o1
	subcc	%o5, 0x20, %o5
	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_16)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
	bne,pt	%xcc, 1b
	 add	%o0, 0x20, %o0
2:	andcc	%o2, 0x18, %o5
	be,pt	%xcc, 3f
	 sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	add	%o0, 0x08, %o0
	subcc	%o5, 0x08, %o5
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:	brz,pt	%o2, .Lexit_cp
	 cmp	%o2, 0x04
	bl,pn	%xcc, .Ltiny_cp
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 0x04, %o1
	add	%o0, 0x04, %o0
	subcc	%o2, 0x04, %o2
	bne,pn	%xcc, .Ltiny_cp
	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
	ba,a,pt	%xcc, .Lexit_cp

.Lmedium_unaligned_cp:
	/* First get dest 8 byte aligned.  */
	sub	%g0, %o0, %o3
	and	%o3, 0x7, %o3
	brz,pt	%o3, 2f
	 sub	%o2, %o3, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_o3)
	add	%o1, 1, %o1
	subcc	%o3, 1, %o3
	add	%o0, 1, %o0
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_o3_plus_1)
2:
	and	%o1, 0x7, %o3
	brz,pn	%o3, .Lmedium_noprefetch_cp
	 sll	%o3, 3, %o3
	mov	64, %g2
	sub	%g2, %o3, %g2
	andn	%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
	sllx	%o4, %o3, %o4
	andn	%o2, 0x08 - 1, %o5
	sub	%o2, %o5, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	subcc	%o5, 0x08, %o5
	srlx	%g3, %g2, %g7
	or	%g7, %o4, %g7
	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, 0x08, %o0
	bne,pt	%xcc, 1b
	 sllx	%g3, %o3, %o4
	srl	%o3, 3, %o3
	add	%o1, %o3, %o1
	brz,pn	%o2, .Lexit_cp
	 nop
	ba,pt	%xcc, .Lsmall_unaligned_cp

.Ltiny_cp:
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
	ba,pt	%xcc, .Lexit_cp
	 EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
	andcc	%g2, 0x3, %g0
	bne,pn	%xcc, .Lsmall_unaligned_cp
	 andn	%o2, 0x4 - 1, %o5
	sub	%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x04, %o1
	subcc	%o5, 0x04, %o5
	add	%o0, 0x04, %o0
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
	brz,pt	%o2, .Lexit_cp
	 nop
	ba,a,pt	%xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 1, %o1
	add	%o0, 1, %o0
	subcc	%o2, 1, %o2
	bne,pt	%xcc, 1b
	 EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
	ba,a,pt	%xcc, .Lexit_cp
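/*
 * .Lmedium_unaligned_cp above avoids VIS entirely by keeping a
 * sliding 64-bit window in integer registers. With the source
 * offset o = src & 7 (nonzero on this path), ls = 8*o and
 * rs = 64 - ls, each iteration is, in illustrative big-endian C:
 *
 *	cur  = load64(s & ~7UL);	// aligned load
 *	next = load64((s & ~7UL) + 8);
 *	store64(d, (cur << ls) | (next >> rs));
 *	cur = next;	// 'sllx %g3, %o3, %o4' keeps cur << ls around
 */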
.Lsmallrest:
	tst	%o2
	bz,pt	%xcc, .Lsmallx
	 cmp	%o2, 4
	blt,pn	%xcc, .Lsmallleft3
	 nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)	! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)	! write byte
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)	! repeat for total of 4
	add	%o1, 4, %o1		! advance src by 4
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
	add	%o0, 4, %o0		! advance dst by 4
	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
	bgu,pt	%xcc, .Lsmallnotalign4	! loop til 3 or fewer bytes remain
	 EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%xcc, .Lsmallx
.Lsmallleft3:				! 1, 2, or 3 bytes remain
	 subcc	%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)	! load one byte
	bz,pt	%xcc, .Lsmallx
	 EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)	! store one byte
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)	! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%xcc, .Lsmallx
	 EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)	! store second byte
	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2_plus_1)	! load third byte
	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2_plus_1)	! store third byte
.Lsmallx:
	retl
	 mov	EX_RETVAL(%g1), %o0
.Lsmallfin:
	tst	%o2
	bnz,pn	%xcc, .Lsmallleft3
	 nop
	retl
	 mov	EX_RETVAL(%g1), %o0	! restore %o0
.Lexit_cp:
	retl
	 mov	EX_RETVAL(%g1), %o0
	.size	FUNC_NAME, .-FUNC_NAME
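/*
 * Note: this file is written as a template. FUNC_NAME, LOAD/STORE
 * and the EX_* wrappers default to the plain in-kernel memcpy above,
 * but a wrapper file may redefine them before including this source
 * to build a fault-tolerant user copy. Hypothetical sketch
 * (illustration only; the exception-table plumbing is simplified
 * and the file name is made up):
 *
 *	#define EX_LD(x,y)		\
 *	98:	x;			\
 *		.section __ex_table,"a";\
 *		.word 98b, y;		\
 *		.text;
 *	#define FUNC_NAME	my_copy_from_user
 *	#define LOAD(type,addr,dest)	type##a [addr] %asi, dest
 *	#include "M7memcpy.S"
 */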