/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * SPARC T7/M7 Flow :
 *
 * if (count < SMALL_MAX) {
 *   if count < SHORTCOPY		(SHORTCOPY=3)
 *	copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= SHORTCHECK	(SHORTCHECK=14)
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {					More than SMALL_MAX bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < MED_MAX) {
 * finish_long:				src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {				src/dst aligned; count > MED_MAX
 *       align dst on 64 byte boundary; for main data movement:
 *       prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *       Use BIS (block initializing store) to avoid copying store cache
 *       lines from memory. But pre-store first element of each cache line
 *       ST_CHUNK lines in advance of the rest of that cache line. That
 *       gives time for replacement cache lines to be written back without
 *       excess STQ and Miss Buffer filling. Repeat until near the end,
 *       then finish up storing before going to finish_long.
 *     }
 *   } else {				src/dst not aligned on 8 bytes
 *     if src is word aligned and count < MED_WMAX
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < MED_UMAX
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; prefetch src data to L1 cache
 *       loadx8, falign, block-store, prefetch loop
 *       (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
 * }
 *
 */
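
	! For orientation, a sketch of the dispatch above in C.  The
	! helper names are hypothetical stand-ins for the labeled regions
	! of this file; the thresholds are the #defines further down.
	!
	!	void *m7_memcpy(void *dst, const void *src, size_t len)
	!	{
	!		if (len == 0)
	!			return dst;				/* .Lsmallx */
	!		if (len <= 3)
	!			return tiny_copy(dst, src, len);	/* .Ltiny_cp */
	!		if (len <= 19)
	!			return small_copy(dst, src, len);	/* .Lsmall_cp */
	!		if (len < SMALL_MAX)
	!			return medium_copy(dst, src, len);	/* .Lmedium_cp */
	!		return large_copy(dst, src, len);		/* .Lmedium */
	!	}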

#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
 * line as "least recently used" which means if many threads are
 * active, it has a high probability of being pushed out of the cache
 * between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
 * marks the cache line as "most recently used" for all
 * but the last cache line.
 */
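
	! As a rough sketch, the resulting store pattern for one 64-byte
	! cache line, where stxa_asi() is a hypothetical C wrapper for
	! "stxa src, [addr] asi" (BIS stores have no real C equivalent):
	!
	!	for (i = 0; i < 7; i++)			/* keep the line resident */
	!		stxa_asi(line + 8*i, val[i], STORE_MRU_ASI);
	!	stxa_asi(line + 56, val[7], STORE_ASI);	/* last word: mark LRU */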
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI	ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)	stxa src, [addr] STORE_MRU_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	M7memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#define	BLOCK_SIZE	64
#define	SHORTCOPY	3
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define	SMALL_MAX	128
#define	MED_UMAX	1024	/* max copy for medium un-aligned case */
#define	MED_WMAX	1024	/* max copy for medium word-aligned case */
#define	MED_MAX		1024	/* max copy for medium longword-aligned case */
#define	ST_CHUNK	24	/* ST_CHUNK - block of values for BIS Store */
#define	ALIGN_PRE	24	/* distance for aligned prefetch loop */

	.register	%g2,#scratch

	.section	".text"
	.global	FUNC_NAME
	.type	FUNC_NAME, #function
	.align	16
FUNC_NAME:
	srlx	%o2, 31, %g2
	cmp	%g2, 0
	tne	%xcc, 5
	PREAMBLE
	mov	%o0, %g1	! save %o0
	brz,pn	%o2, .Lsmallx
	cmp	%o2, 3
	ble,pn	%icc, .Ltiny_cp
	cmp	%o2, 19
	ble,pn	%icc, .Lsmall_cp
	or	%o0, %o1, %g2
	cmp	%o2, SMALL_MAX
	bl,pn	%icc, .Lmedium_cp
	nop

.Lmedium:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .Ldst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	add	%o1, %o0, %o4
	EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)	! load one byte
	subcc	%o5, 1, %o5
	EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
	bgu,pt	%xcc, 7b
	add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	nop

.Lsrc_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	set	MED_MAX, %o3
	cmp	%o2, %o3		! limit to store buffer size
	bgu,pn	%xcc, .Llarge_align8_copy
	nop

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
	subcc	%o2, 63, %o2		! adjust length to allow cc test
	ble,pn	%xcc, .Lmedl63		! skip big loop if less than 64 bytes
	nop
.Lmedl64:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)	! load
	subcc	%o2, 64, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56)	! a block of 64
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
	EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
	EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)	! load and store
	EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
	EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)	! a block of 64
	add	%o1, 64, %o1		! increase src ptr by 64
	EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
	add	%o0, 64, %o0		! increase dst ptr by 64
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
	bgu,pt	%xcc, .Lmedl64		! repeat if at least 64 bytes left
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
	addcc	%o2, 32, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl31		! to skip if 31 or fewer bytes left
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)	! load
	sub	%o2, 32, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24)	! a block of 32
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl15		! skip if 15 or fewer bytes left
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
	sub	%o2, 16, %o2		! decrease count by 16
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%xcc, .Lsmallx		! exit if finished
	cmp	%o2, 8
	blt,pt	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	tst	%o2
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)	! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz,pn	%xcc, .Lmedw7
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)	! and store 8
	retl
	mov	EX_RETVAL(%g1), %o0	! restore %o0

	.align 16
.Lsrc_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc	%o1, 0x3, %o5		! test word alignment
	bnz,pt	%xcc, .Lunalignsetup	! branch to skip if not word aligned
	nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination are in cache for .Lmedium
 * to short data moves.
 */
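	! Each iteration below pairs two 32-bit loads into one aligned
	! 64-bit store.  A rough C equivalent of one step (SPARC is
	! big-endian, so the first word becomes the high half; sketch
	! only, src assumed 4-byte aligned and dst 8-byte aligned):
	!
	!	u64 v = ((u64)((u32 *)src)[0] << 32) | ((u32 *)src)[1];
	!	*(u64 *)dst = v;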
	set	MED_WMAX, %o3
	cmp	%o2, %o3		! limit to store buffer size
	bge,pt	%xcc, .Lunalignrejoin	! otherwise rejoin main loop
	nop

	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%xcc, .Lmedw31		! skip big loop if less than 32
.Lmedw32:
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)	! move a block of 32
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
	subcc	%o2, 32, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
	or	%o4, %o5, %o5
	bgu,pt	%xcc, .Lmedw32		! repeat if at least 32 bytes left
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
	addcc	%o2, 31, %o2		! restore count

	bz,pt	%xcc, .Lsmallx		! exit if finished
	nop
	cmp	%o2, 16
	blt,pt	%xcc, .Lmedw15
	nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! move a block of 16 bytes
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
	bz,pt	%xcc, .Lsmallx		! exit if finished
	cmp	%o2, 8
	blt,pn	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	tst	%o2
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)	! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)	! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)	! and store 4 bytes
	bz,pt	%xcc, .Lsmallx		! exit if finished
.Lmedw7:				! count is ge 1, less than 8
	cmp	%o2, 4			! check for 4 bytes left
	blt,pn	%xcc, .Lsmallleft3	! skip if 3 or fewer bytes left
	nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	add	%o1, 4, %o1		! increase src ptr by 4
	add	%o0, 4, %o0		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.Lsmallleft3
	EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)	! and store 4 bytes
	retl
	mov	EX_RETVAL(%g1), %o0

	.align 16
.Llarge_align8_copy:			! Src and dst share 8 byte alignment
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .Laligned_to_64
	andcc	%o0, 8, %o3		! odd long words to move?
	brz,pt	%o3, .Laligned_to_16
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		! increment src ptr
	add	%o0, 8, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
	andcc	%o0, 16, %o3		! pair of long words to move?
	brz,pt	%o3, .Laligned_to_32
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 16, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
	andcc	%o0, 32, %o3		! four long words to move?
	brz,pt	%o3, .Laligned_to_64
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 32, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
	add	%o1, 32, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 32, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
	!
	! Using block init store (BIS) instructions to avoid fetching cache
	! lines from memory. Use ST_CHUNK stores to first element of each cache
	! line (similar to prefetching) to avoid overfilling STQ or miss buffers.
	! Gives existing cache lines time to be moved out of L1/L2/L3 cache.
	! Initial stores using MRU version of BIS to keep cache line in
	! cache until we are ready to store final element of cache line.
	! Then store last element using the LRU version of BIS.
	!
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	!
	! We use STORE_MRU_ASI for the first seven stores to each cache line
	! followed by STORE_ASI (mark as LRU) for the last store. That
	! mixed approach reduces the probability that the cache line is removed
	! before we finish setting it, while minimizing the effects on
	! other cached values during a large memcpy.
	!
	! ST_CHUNK batches up initial BIS operations for several cache lines
	! to allow multiple requests to not be blocked by overflowing the
	! store miss buffer. Then the matching stores for all those
	! BIS operations are executed.
	!
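	! The shape of that batching in pseudo-C (bis_store_first_word()
	! and fill_rest_of_line() are hypothetical names for the two
	! loops below):
	!
	!	while (blocks >= ST_CHUNK * 64) {
	!		for (i = 0; i < ST_CHUNK; i++)	/* .Lalign_loop_start */
	!			bis_store_first_word(dst + i*64, src + i*64);
	!		for (i = 0; i < ST_CHUNK; i++)	/* .Lalign_loop_rest */
	!			fill_rest_of_line(dst + i*64, src + i*64);
	!		src += ST_CHUNK*64; dst += ST_CHUNK*64;
	!		blocks -= ST_CHUNK*64;
	!	}
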
	sub	%o0, 8, %o0		! adjust %o0 for ASI alignment
.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%xcc, .Lalign_loop_fin
	mov	ST_CHUNK,%o3
.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	bgu	%xcc,.Lalign_loop_start
	add	%o0, 56, %o0

	mov	ST_CHUNK,%o3
	sllx	%o3, 6, %o4		! ST_CHUNK*64
	sub	%o1, %o4, %o1		! reset %o1
	sub	%o0, %o4, %o0		! reset %o0

.Lalign_loop_rest:
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 16, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
	sub	%o5, 64, %o5
	bgu	%xcc,.Lalign_loop_rest
	! mark cache line as LRU
	EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

	cmp	%o5, ST_CHUNK*64
	bgu,pt	%xcc, .Lalign_loop_start
	mov	ST_CHUNK,%o3

	cmp	%o5, 0
	beq	.Lalign_done
	nop
.Lalign_loop_fin:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
	add	%o1, 64, %o1
	EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
	add	%o0, 64, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
	bgu	%xcc,.Lalign_loop_fin
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)

.Lalign_done:
	add	%o0, 8, %o0		! restore %o0 from ASI alignment
	membar	#StoreStore
	sub	%o2, 63, %o2		! adjust length to allow cc test
	ba	.Lmedl63		! finish up in .Lmedl63
	nop

	.align 16
	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
	mov	%g1, %o3	! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
	VISEntryHalf
#endif
	mov	%o3, %g1	! restore %g1

	set	MED_UMAX, %o3
	cmp	%o2, %o3		! check for .Lmedium unaligned limit
	bge,pt	%xcc,.Lunalign_large
	prefetch [%o1 + (4 * BLOCK_SIZE)], 20
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	cmp	%o2, 8			! ensure we do not load beyond
	bgt	.Lunalign_adjust	! end of source buffer
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o2, 64, %o2		! adjust to leave loop
	sub	%o5, 64, %o5		! early if necessary
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
.Lunalign_loop:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	faligndata %f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
	subcc	%o5, BLOCK_SIZE, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
	faligndata %f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
	faligndata %f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
	add	%o4, BLOCK_SIZE, %o4
	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
	faligndata %f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
	faligndata %f14, %f0, %f30
	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%xcc, .Lunalign_loop
	prefetch [%o4 + (5 * BLOCK_SIZE)], 20
	ba	.Lunalign_done
	nop

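	! alignaddr records the source's byte offset in %gsr; faligndata
	! then extracts 8 bytes from the 16-byte concatenation of its two
	! operands at that offset.  An integer sketch of one step, with
	! off = src & 7 and assuming 0 < off < 8 (the C shifts below are
	! undefined for off == 0):
	!
	!	u64 hi = asrc[i], lo = asrc[i + 1];	/* 8-byte aligned loads */
	!	*(u64 *)dst = (hi << 8*off) | (lo >> 8*(8 - off));
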
.Lunalign_large:
	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%xcc, .Lunalignsrc
	sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%o1, 0x1, %o5
	bnz	%xcc, .Lunalignbyte	! check for byte alignment
	nop
	andcc	%o1, 2, %o5		! check for half word alignment
	bnz	%xcc, .Lunalignhalf
	nop
	! Src is word aligned
.Lunalignword:
	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)	! and store 4
	subcc	%o3, 8, %o3		! decrease count by 8
	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)	! load 4
	add	%o0, 8, %o0		! increase dst ptr by 8
	bnz	%xcc, .Lunalignword
	EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
	ba	.Lunalignsrc
	nop

	! Src is half-word aligned
.Lunalignhalf:
	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 2 bytes
	sllx	%o4, 32, %o5		! shift left
	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	sllx	%o5, 16, %o5
	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	add	%o1, 8, %o1
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignhalf
	add	%o0, 8, %o0
	ba	.Lunalignsrc
	nop

	! Src is Byte aligned
.Lunalignbyte:
	sub	%o0, %o1, %o0		! share pointer advance
.Lunalignbyte_loop:
	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 56, %o5
	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 40, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 24, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 8, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	add	%o0, %o1, %o0
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	sub	%o0, %o1, %o0
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignbyte_loop
	add	%o1, 8, %o1
	add	%o0, %o1, %o0		! restore pointer

	! Destination is now block (64 byte) aligned
.Lunalignsrc:
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	add	%o2, 64, %o2		! ensure we do not load beyond
	sub	%o5, 64, %o5		! end of source buffer

	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks

	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
.Lunalign_sloop:
	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
	faligndata %f14, %f16, %f0
	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
	faligndata %f16, %f18, %f2
	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
	faligndata %f18, %f20, %f4
	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f20, %f22, %f6
	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f22, %f24, %f8
	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f24, %f26, %f10
	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f26, %f28, %f12
	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
	add	%o4, 64, %o4
	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f28, %f30, %f14
	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
	add	%o0, 64, %o0
	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
	fsrc2	%f30, %f14
	bgu,pt	%xcc, .Lunalign_sloop
	prefetch [%o4 + (8 * BLOCK_SIZE)], 20

.Lunalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%o2, 15
	bleu	%xcc, .Lunalign_short

	andn	%o2, 0x7, %o5		! %o5 is multiple of 8
	and	%o2, 0x7, %o2		! residue bytes in %o2
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		! ensure we do not load past end of src
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)	! fetch partial word
.Lunalign_by8:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
	fsrc2	%f2, %f0
	bgu,pt	%xcc, .Lunalign_by8
	add	%o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	ba	.Lsmallrest
	nop

/*
 * This is a special case of nested memcpy. This can happen when the kernel
 * calls unaligned memcpy back to back without saving FP registers. We need
 * traps (context switches) to save/restore FP registers. If the kernel calls
 * memcpy without this trap sequence we will hit FP corruption. Let's use
 * the normal integer load/store method in this case.
 */
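
	! In NON_USER_COPY builds, VISEntryHalfFast (see .Lunalignrejoin)
	! branches to .Lmedium_vis_entry_fail_cp below when the FP state
	! cannot be used safely.  Conceptually (can_use_fp() is a
	! hypothetical test, not a real kernel API):
	!
	!	if (!can_use_fp())
	!		return integer_copy(dst, src, len);	/* .Lmedium_cp */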

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
	or	%o0, %o1, %g2
#endif
.Lmedium_cp:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc	%g2, 0x7, %g0
	bne,pn	%xcc, .Lmedium_unaligned_cp
	nop

.Lmedium_noprefetch_cp:
	andncc	%o2, 0x20 - 1, %o5
	be,pn	%xcc, 2f
	sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 0x20, %o1
	subcc	%o5, 0x20, %o5
	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
	bne,pt	%xcc, 1b
	add	%o0, 0x20, %o0
2:	andcc	%o2, 0x18, %o5
	be,pt	%xcc, 3f
	sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	add	%o0, 0x08, %o0
	subcc	%o5, 0x08, %o5
	bne,pt	%xcc, 1b
	EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:	brz,pt	%o2, .Lexit_cp
	cmp	%o2, 0x04
	bl,pn	%xcc, .Ltiny_cp
	nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 0x04, %o1
	add	%o0, 0x04, %o0
	subcc	%o2, 0x04, %o2
	bne,pn	%xcc, .Ltiny_cp
	EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
	ba,a,pt	%xcc, .Lexit_cp

.Lmedium_unaligned_cp:
	/* First get dest 8 byte aligned. */
	sub	%g0, %o0, %o3
	and	%o3, 0x7, %o3
	brz,pt	%o3, 2f
	sub	%o2, %o3, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add	%o1, 1, %o1
	subcc	%o3, 1, %o3
	add	%o0, 1, %o0
	bne,pt	%xcc, 1b
	EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
	and	%o1, 0x7, %o3
	brz,pn	%o3, .Lmedium_noprefetch_cp
	sll	%o3, 3, %o3
	mov	64, %g2
	sub	%g2, %o3, %g2
	andn	%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
	sllx	%o4, %o3, %o4
	andn	%o2, 0x08 - 1, %o5
	sub	%o2, %o5, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	subcc	%o5, 0x08, %o5
	srlx	%g3, %g2, %g7
	or	%g7, %o4, %g7
	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, 0x08, %o0
	bne,pt	%xcc, 1b
	sllx	%g3, %o3, %o4
	srl	%o3, 3, %o3
	add	%o1, %o3, %o1
	brz,pn	%o2, .Lexit_cp
	nop
	ba,pt	%xcc, .Lsmall_unaligned_cp

.Ltiny_cp:
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
	ba,pt	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
	andcc	%g2, 0x3, %g0
	bne,pn	%xcc, .Lsmall_unaligned_cp
	andn	%o2, 0x4 - 1, %o5
	sub	%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x04, %o1
	subcc	%o5, 0x04, %o5
	add	%o0, 0x04, %o0
	bne,pt	%xcc, 1b
	EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
	brz,pt	%o2, .Lexit_cp
	nop
	ba,a,pt	%xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 1, %o1
	add	%o0, 1, %o0
	subcc	%o2, 1, %o2
	bne,pt	%xcc, 1b
	EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
	ba,a,pt	%xcc, .Lexit_cp

.Lsmallrest:
	tst	%o2
	bz,pt	%xcc, .Lsmallx
	cmp	%o2, 4
	blt,pn	%xcc, .Lsmallleft3
	nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)	! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)	! write byte & repeat
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)	! for total of 4
	add	%o1, 4, %o1		! advance SRC by 4
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
	add	%o0, 4, %o0		! advance DST by 4
	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
	bgu,pt	%xcc, .Lsmallnotalign4	! loop til 3 or fewer bytes remain
	EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%xcc, .Lsmallx
.Lsmallleft3:				! 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)	! load one byte
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)	! store one byte
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)	! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)	! store second byte
	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)	! load third byte
	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)	! store third byte
.Lsmallx:
	retl
	mov	EX_RETVAL(%g1), %o0
.Lsmallfin:
	tst	%o2
	bnz,pn	%xcc, .Lsmallleft3
	nop
	retl
	mov	EX_RETVAL(%g1), %o0	! restore %o0
.Lexit_cp:
	retl
	mov	EX_RETVAL(%g1), %o0
	.size	FUNC_NAME, .-FUNC_NAME