/* SPDX-License-Identifier: GPL-2.0 */
/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE	%g7
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_AIUS, %asi
#else
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

#ifdef __sparc_v9__
#define SAVE_AMOUNT	128
#else
#define SAVE_AMOUNT	64
#endif

#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
#endif
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
__restore_asi:
	ret
	wr	%g0, ASI_AIUS, %asi
	 restore
ENTRY(NG_ret_i2_plus_i4_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i5, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_1)
ENTRY(NG_ret_i2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1)
ENTRY(NG_ret_i2_plus_g1_minus_8)
	sub	%g1, 8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_8)
ENTRY(NG_ret_i2_plus_g1_minus_16)
	sub	%g1, 16, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_16)
ENTRY(NG_ret_i2_plus_g1_minus_24)
	sub	%g1, 24, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_24)
ENTRY(NG_ret_i2_plus_g1_minus_32)
	sub	%g1, 32, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_32)
ENTRY(NG_ret_i2_plus_g1_minus_40)
	sub	%g1, 40, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_40)
ENTRY(NG_ret_i2_plus_g1_minus_48)
	sub	%g1, 48, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_48)
ENTRY(NG_ret_i2_plus_g1_minus_56)
	sub	%g1, 56, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_56)
ENTRY(NG_ret_i2_plus_i4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4)
ENTRY(NG_ret_i2_plus_i4_minus_8)
	sub	%i4, 8, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_minus_8)
ENTRY(NG_ret_i2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 8, %i0
ENDPROC(NG_ret_i2_plus_8)
ENTRY(NG_ret_i2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 4, %i0
ENDPROC(NG_ret_i2_plus_4)
ENTRY(NG_ret_i2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 1, %i0
ENDPROC(NG_ret_i2_plus_1)
ENTRY(NG_ret_i2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_plus_1)
ENTRY(NG_ret_i2)
	ba,pt	%xcc, __restore_asi
	 mov	%i2, %i0
ENDPROC(NG_ret_i2)
ENTRY(NG_ret_i2_and_7_plus_i4)
	and	%i2, 7, %i2
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_and_7_plus_i4)
#endif
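/* Each NG_ret_* stub above is an exception fixup: its name spells out
 * the expression loaded into %i0 (the count of bytes NOT yet copied)
 * before unwinding through __restore_asi; NG_ret_i2_plus_g1, for
 * example, returns %i2 + %g1.  The second argument to EX_LD()/EX_ST()
 * names the stub for that access: the plain memcpy build discards it,
 * while user-copy wrappers are expected to turn it into an exception
 * table entry (see the sketch at the end of this file).
 */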

	.align	64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %i0=dst, %i1=src, %i2=len */
	PREAMBLE
	save		%sp, -SAVE_AMOUNT, %sp
	srlx		%i2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	mov		%i0, %o0
	cmp		%i2, 0
	be,pn		%XCC, 85f
	 or		%o0, %i1, %i3
	cmp		%i2, 16
	blu,a,pn	%XCC, 80f
	 or		%i3, %i2, %i3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
	cmp		%i2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%i3, 0x7, %g0
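	/* Worked worst case, as a check on the bound above: len = 128
	 * with dst at ...0x01 burns 63 bytes on destination alignment,
	 * leaving 65, so the block loop still gets exactly one full
	 * 64-byte iteration plus a 1-byte tail copy.
	 */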

	/* %o0:	dst
	 * %i1:	src
	 * %i2:	len  (known to be >= 128)
	 *
	 * The block copy loops will use %i4/%i5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %i1, #one_read)
	wr		%g0, STORE_ASI, %asi

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %i4
	be,pt		%XCC, 2f
	 sub		%i4, 64, %i4
	sub		%g0, %i4, %i4	! bytes to align dst
	sub		%i2, %i4, %i2
1:	subcc		%i4, 1, %i4
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
	EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
	add		%i1, 1, %i1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop.  If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line we have to store the whole
	 * cacheline else we can end up with corrupt L2 cache line
	 * contents.  Since the loop works on 64-bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
2:
	andcc		%i1, (16 - 1), %i4
	andn		%i2, (64 - 1), %g1	! block copy loop iterator
	be,pt		%XCC, 50f
	 sub		%i2, %g1, %i2		! final sub-block copy bytes

	cmp		%i4, 8
	be,pt		%XCC, 10f
	 sub		%i1, %i4, %i1

	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
	and		%i4, 0x7, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE
	mov		64, %i5
	EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
	sub		%i5, GLOBAL_SPARE, %i5
	mov		16, %o4
	mov		32, %o5
	mov		48, %o7
	mov		64, %i3

	bg,pn		%XCC, 9f
	 nop

#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
	sllx		WORD1, POST_SHIFT, WORD1; \
	srlx		WORD2, PRE_SHIFT, TMP; \
	sllx		WORD2, POST_SHIFT, WORD2; \
	or		WORD1, TMP, WORD1; \
	srlx		WORD3, PRE_SHIFT, TMP; \
	or		WORD2, TMP, WORD2;
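
/* A rough C model of one MIX_THREE_WORDS application, assuming the
 * source sits `off` bytes past an 8-byte boundary (0 < off < 8), with
 * w1..w3 the big-endian 64-bit words loaded from the aligned-down
 * source, post = 8 * off (GLOBAL_SPARE above) and pre = 64 - post
 * (%i5 above):
 *
 *	w1 = (w1 << post) | (w2 >> pre);
 *	w2 = (w2 << post) | (w3 >> pre);
 *
 * producing two destination-aligned words, i.e. the "integer
 * faligndata" mentioned earlier.
 */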

8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 8b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1
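
	/* The loop at 9f is the same dance for a source that sits in
	 * the high word of each 16-byte twin load (offset 9-15 bytes,
	 * %i4 > 8 above), so the rotation of values through the
	 * MIX_THREE_WORDS arguments starts one register later.
	 */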
9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 9b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned but it has been subtracted by 8 and we perform
	 * one twin load ahead, then add 8 back into source when
	 * we finish the loop.
	 */
	EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
	add	%i1, 64, %i1
	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	 add	%o0, 64, %o0

	ba,pt	%XCC, 60f
	 add	%i1, 0x8, %i1

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	add	%i1, 64, %i1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	 add	%o0, 64, %o0
	/* fall through */
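
	/* All three block-copy variants converge here.  The stores
	 * above went out with the block-init ASI, so they must be
	 * synced before we switch back to the normal ASI and finish
	 * the tail with ordinary loads and stores.
	 */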
60:
	membar		#Sync

	/* %i2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
	RESTORE_ASI(%i3)
	brz,pt		%i2, 85f
	 sub		%o0, %i1, %i3
	ba,a,pt		%XCC, 90f
	 nop

	.align		64
70: /* 16 < len <= 64 */
	bne,pn		%XCC, 75f
	 sub		%o0, %i1, %i3

72:
	andn		%i2, 0xf, %i4
	and		%i2, 0xf, %i2
1:	subcc		%i4, 0x10, %i4
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4)
	add		%i1, 0x08, %i1
	EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4)
	sub		%i1, 0x08, %i1
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4)
	add		%i1, 0x8, %i1
	EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8)
	bgu,pt		%XCC, 1b
	 add		%i1, 0x8, %i1
73:	andcc		%i2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x8, %i2
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
	add		%i1, 0x8, %i1
1:	andcc		%i2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x4, %i2
	EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
	add		%i1, 0x4, %i1
1:	cmp		%i2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop
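
	/* Medium copy with src and/or dst not 8-byte aligned: first
	 * byte-copy until dst is aligned, then either rejoin the
	 * word loops above (72b/73b) or, if src is still misaligned,
	 * fall into the aligned-load shift-and-merge loop at 8f.
	 */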
75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%i2, %g1, %i2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
	EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%i1, 1, %i1

2:	add		%i1, %i3, %o0
	andcc		%i1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%i2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

8:	mov		64, %i3
	andn		%i1, 0x7, %i1
	EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
	sub		%i3, %g1, %i3
	andn		%i2, 0x7, %i4
	sllx		%g2, %g1, %g2
1:	add		%i1, 0x8, %i1
	EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
	subcc		%i4, 0x8, %i4
	srlx		%g3, %i3, %i5
	or		%i5, %g2, %i5
	EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%i2, 0x7, %i2
	be,pn		%icc, 85f
	 add		%i1, %g1, %i1
	ba,pt		%xcc, 90f
	 sub		%o0, %i1, %i3

	.align		64
80: /* 0 < len <= 16 */
	andcc		%i3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %i1, %i3

1:
	subcc		%i2, 4, %i2
	EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%i1, 4, %i1

85:	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.align		32
90:
	subcc		%i2, 1, %i2
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
	EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%i1, 1, %i1
	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.size		FUNC_NAME, .-FUNC_NAME
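
/* This file is written to be #included by wrappers that rebuild it
 * under another name with fault-checked accesses, which is why every
 * macro above is guarded by #ifndef.  A sketch of how a copy_from_user
 * flavor instantiates it (illustrative only; see the real definitions
 * in arch/sparc/lib/NGcopy_from_user.S):
 *
 *	#define EX_LD(x,y)		\
 *	98:	x;			\
 *		.section __ex_table,"a";\
 *		.align 4;		\
 *		.word 98b, y;		\
 *		.previous;
 *
 *	#define FUNC_NAME	NGcopy_from_user
 *	#define LOAD(type,addr,dest)	type##a [addr] %asi, dest
 *	#define EX_RETVAL(x)	0
 *
 *	#include "NGmemcpy.S"
 */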