/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * SPARC T7/M7 Flow :
 *
 * if (count < SMALL_MAX) {
 *   if count < SHORTCOPY		(SHORTCOPY=3)
 *	copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= SHORTCHECK	(SHORTCHECK=14)
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {					More than SMALL_MAX bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < MED_MAX) {
 * finish_long:				src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {				src/dst aligned; count > MED_MAX
 *       align dst on 64 byte boundary; for main data movement:
 *       prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *       Use BIS (block initializing store) to avoid copying store cache
 *       lines from memory. But pre-store first element of each cache line
 *       ST_CHUNK lines in advance of the rest of that cache line. That
 *       gives time for replacement cache lines to be written back without
 *       excess STQ and Miss Buffer filling. Repeat until near the end,
 *       then finish up storing before going to finish_long.
 *     }
 *   } else {				src/dst not aligned on 8 bytes
 *     if src is word aligned and count < MED_WMAX
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < MED_UMAX
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; prefetch src data to L1 cache
 *       loadx8, falign, block-store, prefetch loop
 *       (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
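 *
 *   (Illustrative sketch only, not code from this file: each of the
 *    "load words, shift, store words" variants above builds one word of
 *    the misaligned stream from two aligned words.  On this big-endian
 *    machine, for a stream that is 'off' bytes past word alignment
 *    (0 < off < 4), the merge is
 *
 *	unsigned int merge(unsigned int prev, unsigned int next,
 *			   unsigned int off)	// 'merge' is a made-up name
 *	{
 *		return (prev << (8 * off)) | (next >> (32 - 8 * off));
 *	}
 *
 *    The faligndata paths later in the file do the same thing eight
 *    bytes at a time, with the offset held in %gsr.)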
 * }
 *
 */

#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
 * line as "least recently used" which means if many threads are
 * active, it has a high probability of being pushed out of the cache
 * between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
 * marks the cache line as "most recently used" for all
 * but the last cache line.
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI	ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)	stxa src, [addr] STORE_MRU_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	M7memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#define BLOCK_SIZE	64
#define SHORTCOPY	3
#define SHORTCHECK	14
#define SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define SMALL_MAX	128
#define MED_UMAX	1024	/* max copy for medium un-aligned case */
#define MED_WMAX	1024	/* max copy for medium word-aligned case */
#define MED_MAX		1024	/* max copy for medium longword-aligned case */
#define ST_CHUNK	24	/* ST_CHUNK - block of values for BIS Store */
#define ALIGN_PRE	24	/* distance for aligned prefetch loop */

	.register	%g2,#scratch

	.section	".text"
	.global		FUNC_NAME
	.type		FUNC_NAME, #function
	.align		16
FUNC_NAME:
	srlx	%o2, 31, %g2
	cmp	%g2, 0
	tne	%xcc, 5
	PREAMBLE
	mov	%o0, %g1		! save %o0
	brz,pn	%o2, .Lsmallx
	cmp	%o2, 3
	ble,pn	%icc, .Ltiny_cp
	cmp	%o2, 19
	ble,pn	%icc, .Lsmall_cp
	or	%o0, %o1, %g2
	cmp	%o2, SMALL_MAX
	bl,pn	%icc, .Lmedium_cp
	nop

.Lmedium:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .Ldst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	add	%o1, %o0, %o4
	EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)	! load one byte
	subcc	%o5, 1, %o5
	EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
	bgu,pt	%xcc, 7b
	add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	nop

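/*
 * A hedged C model of the dst-aligning loop above (illustrative only;
 * 'align_dst8' and its parameter names are made up here).  It shows the
 * register-saving trick: only (src - dst) is kept live, and the source
 * address is rebuilt from dst on each iteration.
 *
 *	static void align_dst8(char **dstp, const char **srcp, size_t *lenp)
 *	{
 *		char *dst = *dstp;
 *		long diff = *srcp - dst;	// sub %o1, %o0, %o1
 *		unsigned long n = (0UL - (unsigned long)dst) & 7;
 *		*lenp -= n;			// sub %o2, %o5, %o2
 *		for (; n != 0; n--, dst++)
 *			*dst = *(dst + diff);	// ldub via dst + diff; stb
 *		*dstp = dst;
 *		*srcp = dst + diff;		// add %o1, %o0, %o1
 *	}
 */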
.Lsrc_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	set	MED_MAX, %o3
	cmp	%o2, %o3		! limit to store buffer size
	bgu,pn	%xcc, .Llarge_align8_copy
	nop

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
	subcc	%o2, 63, %o2		! adjust length to allow cc test
	ble,pn	%xcc, .Lmedl63		! skip big loop if less than 64 bytes
	nop
.Lmedl64:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)	! load
	subcc	%o2, 64, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56)	! a block of 64
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
	EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
	EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)	! load and store
	EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
	EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)	! a block of 64
	add	%o1, 64, %o1		! increase src ptr by 64
	EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
	add	%o0, 64, %o0		! increase dst ptr by 64
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
	bgu,pt	%xcc, .Lmedl64		! repeat if at least 64 bytes left
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
	addcc	%o2, 32, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl31		! to skip if 31 or fewer bytes left
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)	! load
	sub	%o2, 32, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24)	! a block of 32
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl15		! skip if 15 or fewer bytes left
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
	sub	%o2, 16, %o2		! decrease count by 16
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%xcc, .Lsmallx		! exit if finished
	cmp	%o2, 8
	blt,pt	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	tst	%o2
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)	! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz,pn	%xcc, .Lmedw7
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)	! and store 8
	retl
	mov	EX_RETVAL(%g1), %o0	! restore %o0

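/*
 * For reference, a hedged C equivalent of the .Lmedl64 loop above
 * ('copy64_aligned' is a made-up name; assumes 8-byte longs, as on
 * sparc64).  The count is biased by -63 so a single subtract both steps
 * the counter and yields the loop-exit condition:
 *
 *	static void copy64_aligned(unsigned long **dp,
 *				   const unsigned long **sp, size_t *lenp)
 *	{
 *		unsigned long *d = *dp;
 *		const unsigned long *s = *sp;
 *		long n = (long)*lenp - 63;	// subcc %o2, 63, %o2
 *		while (n > 0) {			// one 64-byte block per pass
 *			for (int i = 0; i < 8; i++)
 *				d[i] = s[i];	// 8 x ldx/stx, interleaved
 *			d += 8; s += 8; n -= 64;
 *		}
 *		*dp = d; *sp = s;
 *		*lenp = (size_t)(n + 63);	// 0..63 residue bytes
 *	}
 */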
	.align 16
.Lsrc_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc	%o1, 0x3, %o5		! test word alignment
	bnz,pt	%xcc, .Lunalignsetup	! branch to skip if not word aligned
	nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination are in cache, for medium
 * to short data moves.
 */
	set	MED_WMAX, %o3
	cmp	%o2, %o3		! limit to store buffer size
	bge,pt	%xcc, .Lunalignrejoin	! otherwise rejoin main loop
	nop

	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%xcc, .Lmedw31		! skip big loop if less than 32
.Lmedw32:
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)	! move a block of 32
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
	subcc	%o2, 32, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
	or	%o4, %o5, %o5
	bgu,pt	%xcc, .Lmedw32		! repeat if at least 32 bytes left
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
	addcc	%o2, 31, %o2		! restore count

	bz,pt	%xcc, .Lsmallx		! exit if finished
	nop
	cmp	%o2, 16
	blt,pt	%xcc, .Lmedw15
	nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! move a block of 16 bytes
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
	bz,pt	%xcc, .Lsmallx		! exit if finished
	cmp	%o2, 8
	blt,pn	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	tst	%o2
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)	! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)	! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)	! and store 4 bytes
	bz,pt	%xcc, .Lsmallx		! exit if finished
.Lmedw7:				! count is ge 1, less than 8
	cmp	%o2, 4			! check for 4 bytes left
	blt,pn	%xcc, .Lsmallleft3	! skip if 3 or fewer bytes left
	nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	add	%o1, 4, %o1		! increase src ptr by 4
	add	%o0, 4, %o0		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.Lsmallleft3
	EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)	! and store 4 bytes
	retl
	mov	EX_RETVAL(%g1), %o0

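/*
 * The .Lmedw32/.Lmedw15 blocks above pair two 32-bit loads into one
 * 64-bit store.  A hedged C model ('load8_wordaligned' is a made-up
 * name; big-endian, as on SPARC, so the first word is the upper half):
 *
 *	static unsigned long load8_wordaligned(const unsigned int *s)
 *	{
 *		return ((unsigned long)s[0] << 32) | s[1];  // sllx; or; (stx)
 *	}
 */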
	.align 16
.Llarge_align8_copy:			! Src and dst share 8 byte alignment
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .Laligned_to_64
	andcc	%o0, 8, %o3		! odd long words to move?
	brz,pt	%o3, .Laligned_to_16
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		! increment src ptr
	add	%o0, 8, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
	andcc	%o0, 16, %o3		! pair of long words to move?
	brz,pt	%o3, .Laligned_to_32
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 16, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
	andcc	%o0, 32, %o3		! four long words to move?
	brz,pt	%o3, .Laligned_to_64
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 32, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
	add	%o1, 32, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 32, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
	!
	! Using block init store (BIS) instructions to avoid fetching cache
	! lines from memory. Use ST_CHUNK stores to first element of each cache
	! line (similar to prefetching) to avoid overfilling STQ or miss buffers.
	! Gives existing cache lines time to be moved out of L1/L2/L3 cache.
	! Initial stores using MRU version of BIS to keep cache line in
	! cache until we are ready to store final element of cache line.
	! Then store last element using the LRU version of BIS.
	!
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	!
	! We use STORE_MRU_ASI for the first seven stores to each cache line
	! followed by STORE_ASI (mark as LRU) for the last store. That
	! mixed approach reduces the probability that the cache line is removed
	! before we finish setting it, while minimizing the effects on
	! other cached values during a large memcpy.
	!
	! ST_CHUNK batches up initial BIS operations for several cache lines
	! to allow multiple requests to not be blocked by overflowing the
	! store miss buffer. Then the matching stores for all those
	! BIS operations are executed.
	!

	sub	%o0, 8, %o0		! adjust %o0 for ASI alignment
.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%xcc, .Lalign_loop_fin
	mov	ST_CHUNK, %o3
.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	bgu	%xcc, .Lalign_loop_start
	add	%o0, 56, %o0

	mov	ST_CHUNK, %o3
	sllx	%o3, 6, %o4		! ST_CHUNK*64
	sub	%o1, %o4, %o1		! reset %o1
	sub	%o0, %o4, %o0		! reset %o0

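	! Second pass: fill in the remaining seven long words of each of
	! the ST_CHUNK cache lines whose first long word was pre-stored
	! above; the final store of each line uses the LRU flavor of BIS.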
.Lalign_loop_rest:
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 16, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
	sub	%o5, 64, %o5
	bgu	%xcc, .Lalign_loop_rest
	! mark cache line as LRU
	EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

	cmp	%o5, ST_CHUNK*64
	bgu,pt	%xcc, .Lalign_loop_start
	mov	ST_CHUNK, %o3

	cmp	%o5, 0
	beq	.Lalign_done
	nop
.Lalign_loop_fin:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
	add	%o1, 64, %o1
	EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
	add	%o0, 64, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
	bgu	%xcc, .Lalign_loop_fin
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)

.Lalign_done:
	add	%o0, 8, %o0		! restore %o0 from ASI alignment
	membar	#StoreStore
	sub	%o2, 63, %o2		! adjust length to allow cc test
	ba	.Lmedl63		! in .Lmedl63
	nop

	.align 16
	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
	mov	%g1, %o3		! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
	VISEntryHalf
#endif
	mov	%o3, %g1		! restore %g1

	set	MED_UMAX, %o3
	cmp	%o2, %o3		! check for medium unaligned limit
	bge,pt	%xcc, .Lunalign_large
	prefetch [%o1 + (4 * BLOCK_SIZE)], 20
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	cmp	%o2, 8			! ensure we do not load beyond
	bgt	.Lunalign_adjust	! end of source buffer
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o2, 64, %o2		! adjust to leave loop
	sub	%o5, 64, %o5		! early if necessary
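	! alignaddr below latches the low three bits of %o1 (the source
	! misalignment) in %gsr; each faligndata then concatenates two
	! consecutive aligned doublewords and extracts the eight bytes
	! starting at that offset.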
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
.Lunalign_loop:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	faligndata %f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
	subcc	%o5, BLOCK_SIZE, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
	faligndata %f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
	faligndata %f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
	add	%o4, BLOCK_SIZE, %o4
	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
	faligndata %f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
	faligndata %f14, %f0, %f30
	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%xcc, .Lunalign_loop
	prefetch [%o4 + (5 * BLOCK_SIZE)], 20
	ba	.Lunalign_done
	nop

.Lunalign_large:
	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%xcc, .Lunalignsrc
	sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%o1, 0x1, %o5
	bnz	%xcc, .Lunalignbyte	! check for byte alignment
	nop
	andcc	%o1, 2, %o5		! check for half word alignment
	bnz	%xcc, .Lunalignhalf
	nop
	! Src is word aligned
.Lunalignword:
	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)	! and store 4
	subcc	%o3, 8, %o3		! decrease count by 8
	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)	! load 4
	add	%o0, 8, %o0		! increase dst ptr by 8
	bnz	%xcc, .Lunalignword
	EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
	ba	.Lunalignsrc
	nop

	! Src is half-word aligned
.Lunalignhalf:
	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 2 bytes
	sllx	%o4, 32, %o5		! shift left
	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	sllx	%o5, 16, %o5
	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	add	%o1, 8, %o1
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignhalf
	add	%o0, 8, %o0
	ba	.Lunalignsrc
	nop

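/*
 * A hedged C model of one .Lunalignhalf iteration above
 * ('load8_halfaligned' is a made-up name; big-endian merge, and the
 * 4-byte load is aligned because src == 2 (mod 4) on this path):
 *
 *	#include <string.h>
 *	static unsigned long load8_halfaligned(const unsigned char *s)
 *	{
 *		unsigned short h0, h2;
 *		unsigned int w1;
 *		memcpy(&h0, s, 2);		// lduh [s]
 *		memcpy(&w1, s + 2, 4);		// lduw [s+2]
 *		memcpy(&h2, s + 6, 2);		// lduh [s+6]
 *		return ((unsigned long)h0 << 48) |
 *		       ((unsigned long)w1 << 16) | h2;	// sllx/or merge
 *	}
 */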
	! Src is byte aligned
.Lunalignbyte:
	sub	%o0, %o1, %o0		! share pointer advance
.Lunalignbyte_loop:
	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 56, %o5
	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 40, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 24, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 8, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	add	%o0, %o1, %o0
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	sub	%o0, %o1, %o0
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignbyte_loop
	add	%o1, 8, %o1
	add	%o0, %o1, %o0		! restore pointer

	! Destination is now block (64 byte) aligned
.Lunalignsrc:
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	add	%o2, 64, %o2		! ensure we do not load beyond
	sub	%o5, 64, %o5		! end of source buffer

	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks

	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
.Lunalign_sloop:
	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
	faligndata %f14, %f16, %f0
	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
	faligndata %f16, %f18, %f2
	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
	faligndata %f18, %f20, %f4
	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f20, %f22, %f6
	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f22, %f24, %f8
	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f24, %f26, %f10
	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f26, %f28, %f12
	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
	add	%o4, 64, %o4
	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f28, %f30, %f14
	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
	add	%o0, 64, %o0
	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
	fsrc2	%f30, %f14
	bgu,pt	%xcc, .Lunalign_sloop
	prefetch [%o4 + (8 * BLOCK_SIZE)], 20

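	! Note the software pipelining above: fsrc2 carries the last
	! doubleword loaded (%f30) into %f14, so each iteration issues
	! only eight new ldd's for its eight faligndata results.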
.Lunalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, src not long word aligned
	cmp	%o2, 15
	bleu	%xcc, .Lunalign_short

	andn	%o2, 0x7, %o5		! %o5 is multiple of 8
	and	%o2, 0x7, %o2		! residue bytes in %o2
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		! ensure we do not load past end of src
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)	! fetch partial word
.Lunalign_by8:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
	fsrc2	%f2, %f0
	bgu,pt	%xcc, .Lunalign_by8
	add	%o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	ba	.Lsmallrest
	nop

/*
 * This is a special case of nested memcpy. This can happen when the kernel
 * calls unaligned memcpy back to back without saving FP registers. We need
 * traps (context switch) to save/restore FP registers. If the kernel calls
 * memcpy without this trap sequence we will hit FP corruption. Let's use
 * the normal integer load/store method in this case.
 */

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
	or	%o0, %o1, %g2
#endif
.Lmedium_cp:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc	%g2, 0x7, %g0
	bne,pn	%xcc, .Lmedium_unaligned_cp
	nop

.Lmedium_noprefetch_cp:
	andncc	%o2, 0x20 - 1, %o5
	be,pn	%xcc, 2f
	sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 0x20, %o1
	subcc	%o5, 0x20, %o5
	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
	bne,pt	%xcc, 1b
	add	%o0, 0x20, %o0
2:	andcc	%o2, 0x18, %o5
	be,pt	%xcc, 3f
	sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	add	%o0, 0x08, %o0
	subcc	%o5, 0x08, %o5
	bne,pt	%xcc, 1b
	EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:	brz,pt	%o2, .Lexit_cp
	cmp	%o2, 0x04
	bl,pn	%xcc, .Ltiny_cp
	nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 0x04, %o1
	add	%o0, 0x04, %o0
	subcc	%o2, 0x04, %o2
	bne,pn	%xcc, .Ltiny_cp
	EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
	ba,a,pt	%xcc, .Lexit_cp

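/*
 * A hedged C model of the shift-and-merge inner loop in
 * .Lmedium_unaligned_cp below ('copy_shift' and its parameter names are
 * made up here; big-endian).  Like the assembly, it reads one aligned
 * doubleword beyond the last one stored:
 *
 *	static void copy_shift(unsigned long *dst, const unsigned long *src,
 *			       size_t nwords, unsigned int off)  // off = 1..7
 *	{
 *		unsigned int ls = 8 * off, rs = 64 - ls;
 *		unsigned long carry = src[0] << ls;	// primed before the loop
 *		for (size_t i = 0; i < nwords; i++) {
 *			unsigned long next = src[i + 1];	// ldx [%o1+8]
 *			dst[i] = carry | (next >> rs);		// srlx; or; stx
 *			carry = next << ls;			// sllx (delay slot)
 *		}
 *	}
 */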
.Lmedium_unaligned_cp:
	/* First get dest 8 byte aligned.  */
	sub	%g0, %o0, %o3
	and	%o3, 0x7, %o3
	brz,pt	%o3, 2f
	sub	%o2, %o3, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add	%o1, 1, %o1
	subcc	%o3, 1, %o3
	add	%o0, 1, %o0
	bne,pt	%xcc, 1b
	EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
	and	%o1, 0x7, %o3
	brz,pn	%o3, .Lmedium_noprefetch_cp
	sll	%o3, 3, %o3
	mov	64, %g2
	sub	%g2, %o3, %g2
	andn	%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
	sllx	%o4, %o3, %o4
	andn	%o2, 0x08 - 1, %o5
	sub	%o2, %o5, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	subcc	%o5, 0x08, %o5
	srlx	%g3, %g2, %g7
	or	%g7, %o4, %g7
	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, 0x08, %o0
	bne,pt	%xcc, 1b
	sllx	%g3, %o3, %o4
	srl	%o3, 3, %o3
	add	%o1, %o3, %o1
	brz,pn	%o2, .Lexit_cp
	nop
	ba,pt	%xcc, .Lsmall_unaligned_cp

.Ltiny_cp:
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
	ba,pt	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
	andcc	%g2, 0x3, %g0
	bne,pn	%xcc, .Lsmall_unaligned_cp
	andn	%o2, 0x4 - 1, %o5
	sub	%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x04, %o1
	subcc	%o5, 0x04, %o5
	add	%o0, 0x04, %o0
	bne,pt	%xcc, 1b
	EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
	brz,pt	%o2, .Lexit_cp
	nop
	ba,a,pt	%xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 1, %o1
	add	%o0, 1, %o0
	subcc	%o2, 1, %o2
	bne,pt	%xcc, 1b
	EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
	ba,a,pt	%xcc, .Lexit_cp

.Lsmallrest:
	tst	%o2
	bz,pt	%xcc, .Lsmallx
	cmp	%o2, 4
	blt,pn	%xcc, .Lsmallleft3
	nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)	! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)	! write byte & repeat
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)	! for total of 4
	add	%o1, 4, %o1		! advance SRC by 4
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
	add	%o0, 4, %o0		! advance DST by 4
	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
	bgu,pt	%xcc, .Lsmallnotalign4	! loop until 3 or fewer bytes remain
	EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%xcc, .Lsmallx
.Lsmallleft3:				! 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)	! load one byte
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)	! store one byte
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)	! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)	! store second byte
	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)	! load third byte
	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)	! store third byte
.Lsmallx:
	retl
	mov	EX_RETVAL(%g1), %o0
.Lsmallfin:
	tst	%o2
	bnz,pn	%xcc, .Lsmallleft3
	nop
	retl
	mov	EX_RETVAL(%g1), %o0	! restore %o0
.Lexit_cp:
	retl
	mov	EX_RETVAL(%g1), %o0
	.size	FUNC_NAME, .-FUNC_NAME