1 /* SPDX-License-Identifier: GPL-2.0 */ << 2 /* NGmemcpy.S: Niagara optimized memcpy. 1 /* NGmemcpy.S: Niagara optimized memcpy. 3 * 2 * 4 * Copyright (C) 2006, 2007 David S. Miller (d 3 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net) 5 */ 4 */ 6 5 7 #ifdef __KERNEL__ 6 #ifdef __KERNEL__ 8 #include <linux/linkage.h> 7 #include <linux/linkage.h> 9 #include <asm/asi.h> 8 #include <asm/asi.h> 10 #include <asm/thread_info.h> 9 #include <asm/thread_info.h> 11 #define GLOBAL_SPARE %g7 10 #define GLOBAL_SPARE %g7 12 #define RESTORE_ASI(TMP) \ 11 #define RESTORE_ASI(TMP) \ 13 wr %g0, ASI_AIUS, %asi !! 12 ldub [%g6 + TI_CURRENT_DS], TMP; \ >> 13 wr TMP, 0x0, %asi; 14 #else 14 #else 15 #define GLOBAL_SPARE %g5 15 #define GLOBAL_SPARE %g5 16 #define RESTORE_ASI(TMP) \ 16 #define RESTORE_ASI(TMP) \ 17 wr %g0, ASI_PNF, %asi 17 wr %g0, ASI_PNF, %asi 18 #endif 18 #endif 19 19 20 #ifdef __sparc_v9__ 20 #ifdef __sparc_v9__ 21 #define SAVE_AMOUNT 128 21 #define SAVE_AMOUNT 128 22 #else 22 #else 23 #define SAVE_AMOUNT 64 23 #define SAVE_AMOUNT 64 24 #endif 24 #endif 25 25 26 #ifndef STORE_ASI 26 #ifndef STORE_ASI 27 #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_ 27 #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P 28 #endif 28 #endif 29 29 30 #ifndef EX_LD 30 #ifndef EX_LD 31 #define EX_LD(x,y) x 31 #define EX_LD(x,y) x 32 #endif 32 #endif 33 33 34 #ifndef EX_ST 34 #ifndef EX_ST 35 #define EX_ST(x,y) x 35 #define EX_ST(x,y) x 36 #endif 36 #endif 37 37 38 #ifndef LOAD 38 #ifndef LOAD 39 #ifndef MEMCPY_DEBUG 39 #ifndef MEMCPY_DEBUG 40 #define LOAD(type,addr,dest) type [addr], d 40 #define LOAD(type,addr,dest) type [addr], dest 41 #else 41 #else 42 #define LOAD(type,addr,dest) type##a [addr] 42 #define LOAD(type,addr,dest) type##a [addr] 0x80, dest 43 #endif 43 #endif 44 #endif 44 #endif 45 45 46 #ifndef LOAD_TWIN 46 #ifndef LOAD_TWIN 47 #define LOAD_TWIN(addr_reg,dest0,dest1) \ 47 #define LOAD_TWIN(addr_reg,dest0,dest1) \ 48 ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_ 48 ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0 49 #endif 49 #endif 50 50 51 #ifndef STORE 51 #ifndef STORE 52 #define STORE(type,src,addr) type src, [add 52 #define STORE(type,src,addr) type src, [addr] 53 #endif 53 #endif 54 54 55 #ifndef STORE_INIT 55 #ifndef STORE_INIT 56 #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA 56 #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA 57 #define STORE_INIT(src,addr) stxa src, [add 57 #define STORE_INIT(src,addr) stxa src, [addr] %asi 58 #else 58 #else 59 #define STORE_INIT(src,addr) stx src, [addr 59 #define STORE_INIT(src,addr) stx src, [addr + 0x00] 60 #endif 60 #endif 61 #endif 61 #endif 62 62 63 #ifndef FUNC_NAME 63 #ifndef FUNC_NAME 64 #define FUNC_NAME NGmemcpy 64 #define FUNC_NAME NGmemcpy 65 #endif 65 #endif 66 66 67 #ifndef PREAMBLE 67 #ifndef PREAMBLE 68 #define PREAMBLE 68 #define PREAMBLE 69 #endif 69 #endif 70 70 71 #ifndef XCC 71 #ifndef XCC 72 #define XCC xcc 72 #define XCC xcc 73 #endif 73 #endif 74 74 75 .register %g2,#scratch 75 .register %g2,#scratch 76 .register %g3,#scratch 76 .register %g3,#scratch 77 77 78 .text 78 .text 79 #ifndef EX_RETVAL 79 #ifndef EX_RETVAL 80 #define EX_RETVAL(x) x 80 #define EX_RETVAL(x) x 81 __restore_asi: 81 __restore_asi: 82 ret 82 ret 83 wr %g0, ASI_AIUS, %asi 83 wr %g0, ASI_AIUS, %asi 84 restore 84 restore 85 ENTRY(NG_ret_i2_plus_i4_plus_1) 85 ENTRY(NG_ret_i2_plus_i4_plus_1) 86 ba,pt %xcc, __restore_asi 86 ba,pt %xcc, __restore_asi 87 add %i2, %i5, %i0 87 add %i2, %i5, %i0 88 ENDPROC(NG_ret_i2_plus_i4_plus_1) 88 ENDPROC(NG_ret_i2_plus_i4_plus_1) 89 ENTRY(NG_ret_i2_plus_g1) 89 ENTRY(NG_ret_i2_plus_g1) 90 ba,pt %xcc, __restore_asi 90 ba,pt %xcc, __restore_asi 91 add %i2, %g1, %i0 91 add %i2, %g1, %i0 92 ENDPROC(NG_ret_i2_plus_g1) 92 ENDPROC(NG_ret_i2_plus_g1) 93 ENTRY(NG_ret_i2_plus_g1_minus_8) 93 ENTRY(NG_ret_i2_plus_g1_minus_8) 94 sub %g1, 8, %g1 94 sub %g1, 8, %g1 95 ba,pt %xcc, __restore_asi 95 ba,pt %xcc, __restore_asi 96 add %i2, %g1, %i0 96 add %i2, %g1, %i0 97 ENDPROC(NG_ret_i2_plus_g1_minus_8) 97 ENDPROC(NG_ret_i2_plus_g1_minus_8) 98 ENTRY(NG_ret_i2_plus_g1_minus_16) 98 ENTRY(NG_ret_i2_plus_g1_minus_16) 99 sub %g1, 16, %g1 99 sub %g1, 16, %g1 100 ba,pt %xcc, __restore_asi 100 ba,pt %xcc, __restore_asi 101 add %i2, %g1, %i0 101 add %i2, %g1, %i0 102 ENDPROC(NG_ret_i2_plus_g1_minus_16) 102 ENDPROC(NG_ret_i2_plus_g1_minus_16) 103 ENTRY(NG_ret_i2_plus_g1_minus_24) 103 ENTRY(NG_ret_i2_plus_g1_minus_24) 104 sub %g1, 24, %g1 104 sub %g1, 24, %g1 105 ba,pt %xcc, __restore_asi 105 ba,pt %xcc, __restore_asi 106 add %i2, %g1, %i0 106 add %i2, %g1, %i0 107 ENDPROC(NG_ret_i2_plus_g1_minus_24) 107 ENDPROC(NG_ret_i2_plus_g1_minus_24) 108 ENTRY(NG_ret_i2_plus_g1_minus_32) 108 ENTRY(NG_ret_i2_plus_g1_minus_32) 109 sub %g1, 32, %g1 109 sub %g1, 32, %g1 110 ba,pt %xcc, __restore_asi 110 ba,pt %xcc, __restore_asi 111 add %i2, %g1, %i0 111 add %i2, %g1, %i0 112 ENDPROC(NG_ret_i2_plus_g1_minus_32) 112 ENDPROC(NG_ret_i2_plus_g1_minus_32) 113 ENTRY(NG_ret_i2_plus_g1_minus_40) 113 ENTRY(NG_ret_i2_plus_g1_minus_40) 114 sub %g1, 40, %g1 114 sub %g1, 40, %g1 115 ba,pt %xcc, __restore_asi 115 ba,pt %xcc, __restore_asi 116 add %i2, %g1, %i0 116 add %i2, %g1, %i0 117 ENDPROC(NG_ret_i2_plus_g1_minus_40) 117 ENDPROC(NG_ret_i2_plus_g1_minus_40) 118 ENTRY(NG_ret_i2_plus_g1_minus_48) 118 ENTRY(NG_ret_i2_plus_g1_minus_48) 119 sub %g1, 48, %g1 119 sub %g1, 48, %g1 120 ba,pt %xcc, __restore_asi 120 ba,pt %xcc, __restore_asi 121 add %i2, %g1, %i0 121 add %i2, %g1, %i0 122 ENDPROC(NG_ret_i2_plus_g1_minus_48) 122 ENDPROC(NG_ret_i2_plus_g1_minus_48) 123 ENTRY(NG_ret_i2_plus_g1_minus_56) 123 ENTRY(NG_ret_i2_plus_g1_minus_56) 124 sub %g1, 56, %g1 124 sub %g1, 56, %g1 125 ba,pt %xcc, __restore_asi 125 ba,pt %xcc, __restore_asi 126 add %i2, %g1, %i0 126 add %i2, %g1, %i0 127 ENDPROC(NG_ret_i2_plus_g1_minus_56) 127 ENDPROC(NG_ret_i2_plus_g1_minus_56) 128 ENTRY(NG_ret_i2_plus_i4) 128 ENTRY(NG_ret_i2_plus_i4) 129 ba,pt %xcc, __restore_asi 129 ba,pt %xcc, __restore_asi 130 add %i2, %i4, %i0 130 add %i2, %i4, %i0 131 ENDPROC(NG_ret_i2_plus_i4) 131 ENDPROC(NG_ret_i2_plus_i4) 132 ENTRY(NG_ret_i2_plus_i4_minus_8) 132 ENTRY(NG_ret_i2_plus_i4_minus_8) 133 sub %i4, 8, %i4 133 sub %i4, 8, %i4 134 ba,pt %xcc, __restore_asi 134 ba,pt %xcc, __restore_asi 135 add %i2, %i4, %i0 135 add %i2, %i4, %i0 136 ENDPROC(NG_ret_i2_plus_i4_minus_8) 136 ENDPROC(NG_ret_i2_plus_i4_minus_8) 137 ENTRY(NG_ret_i2_plus_8) 137 ENTRY(NG_ret_i2_plus_8) 138 ba,pt %xcc, __restore_asi 138 ba,pt %xcc, __restore_asi 139 add %i2, 8, %i0 139 add %i2, 8, %i0 140 ENDPROC(NG_ret_i2_plus_8) 140 ENDPROC(NG_ret_i2_plus_8) 141 ENTRY(NG_ret_i2_plus_4) 141 ENTRY(NG_ret_i2_plus_4) 142 ba,pt %xcc, __restore_asi 142 ba,pt %xcc, __restore_asi 143 add %i2, 4, %i0 143 add %i2, 4, %i0 144 ENDPROC(NG_ret_i2_plus_4) 144 ENDPROC(NG_ret_i2_plus_4) 145 ENTRY(NG_ret_i2_plus_1) 145 ENTRY(NG_ret_i2_plus_1) 146 ba,pt %xcc, __restore_asi 146 ba,pt %xcc, __restore_asi 147 add %i2, 1, %i0 147 add %i2, 1, %i0 148 ENDPROC(NG_ret_i2_plus_1) 148 ENDPROC(NG_ret_i2_plus_1) 149 ENTRY(NG_ret_i2_plus_g1_plus_1) 149 ENTRY(NG_ret_i2_plus_g1_plus_1) 150 add %g1, 1, %g1 150 add %g1, 1, %g1 151 ba,pt %xcc, __restore_asi 151 ba,pt %xcc, __restore_asi 152 add %i2, %g1, %i0 152 add %i2, %g1, %i0 153 ENDPROC(NG_ret_i2_plus_g1_plus_1) 153 ENDPROC(NG_ret_i2_plus_g1_plus_1) 154 ENTRY(NG_ret_i2) 154 ENTRY(NG_ret_i2) 155 ba,pt %xcc, __restore_asi 155 ba,pt %xcc, __restore_asi 156 mov %i2, %i0 156 mov %i2, %i0 157 ENDPROC(NG_ret_i2) 157 ENDPROC(NG_ret_i2) 158 ENTRY(NG_ret_i2_and_7_plus_i4) 158 ENTRY(NG_ret_i2_and_7_plus_i4) 159 and %i2, 7, %i2 159 and %i2, 7, %i2 160 ba,pt %xcc, __restore_asi 160 ba,pt %xcc, __restore_asi 161 add %i2, %i4, %i0 161 add %i2, %i4, %i0 162 ENDPROC(NG_ret_i2_and_7_plus_i4) 162 ENDPROC(NG_ret_i2_and_7_plus_i4) 163 #endif 163 #endif 164 164 165 .align 64 165 .align 64 166 166 167 .globl FUNC_NAME 167 .globl FUNC_NAME 168 .type FUNC_NAME,#function 168 .type FUNC_NAME,#function 169 FUNC_NAME: /* %i0=dst, %i1=src, %i2=len * 169 FUNC_NAME: /* %i0=dst, %i1=src, %i2=len */ 170 PREAMBLE 170 PREAMBLE 171 save %sp, -SAVE_AMOUNT, %sp 171 save %sp, -SAVE_AMOUNT, %sp 172 srlx %i2, 31, %g2 172 srlx %i2, 31, %g2 173 cmp %g2, 0 173 cmp %g2, 0 174 tne %xcc, 5 174 tne %xcc, 5 175 mov %i0, %o0 175 mov %i0, %o0 176 cmp %i2, 0 176 cmp %i2, 0 177 be,pn %XCC, 85f 177 be,pn %XCC, 85f 178 or %o0, %i1, %i3 178 or %o0, %i1, %i3 179 cmp %i2, 16 179 cmp %i2, 16 180 blu,a,pn %XCC, 80f 180 blu,a,pn %XCC, 80f 181 or %i3, %i2, %i3 181 or %i3, %i2, %i3 182 182 183 /* 2 blocks (128 bytes) is the minimum 183 /* 2 blocks (128 bytes) is the minimum we can do the block 184 * copy with. We need to ensure that 184 * copy with. We need to ensure that we'll iterate at least 185 * once in the block copy loop. At wo 185 * once in the block copy loop. At worst we'll need to align 186 * the destination to a 64-byte bounda 186 * the destination to a 64-byte boundary which can chew up 187 * to (64 - 1) bytes from the length b 187 * to (64 - 1) bytes from the length before we perform the 188 * block copy loop. 188 * block copy loop. 189 */ 189 */ 190 cmp %i2, (2 * 64) 190 cmp %i2, (2 * 64) 191 blu,pt %XCC, 70f 191 blu,pt %XCC, 70f 192 andcc %i3, 0x7, %g0 192 andcc %i3, 0x7, %g0 193 193 194 /* %o0: dst 194 /* %o0: dst 195 * %i1: src 195 * %i1: src 196 * %i2: len (known to be >= 128) 196 * %i2: len (known to be >= 128) 197 * 197 * 198 * The block copy loops will use %i4/% 198 * The block copy loops will use %i4/%i5,%g2/%g3 as 199 * temporaries while copying the data. 199 * temporaries while copying the data. 200 */ 200 */ 201 201 202 LOAD(prefetch, %i1, #one_read) 202 LOAD(prefetch, %i1, #one_read) 203 wr %g0, STORE_ASI, %asi 203 wr %g0, STORE_ASI, %asi 204 204 205 /* Align destination on 64-byte bounda 205 /* Align destination on 64-byte boundary. */ 206 andcc %o0, (64 - 1), %i4 206 andcc %o0, (64 - 1), %i4 207 be,pt %XCC, 2f 207 be,pt %XCC, 2f 208 sub %i4, 64, %i4 208 sub %i4, 64, %i4 209 sub %g0, %i4, %i4 ! byte 209 sub %g0, %i4, %i4 ! bytes to align dst 210 sub %i2, %i4, %i2 210 sub %i2, %i4, %i2 211 1: subcc %i4, 1, %i4 211 1: subcc %i4, 1, %i4 212 EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_ 212 EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1) 213 EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_ 213 EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1) 214 add %i1, 1, %i1 214 add %i1, 1, %i1 215 bne,pt %XCC, 1b 215 bne,pt %XCC, 1b 216 add %o0, 1, %o0 216 add %o0, 1, %o0 217 217 218 /* If the source is on a 16-byte bound 218 /* If the source is on a 16-byte boundary we can do 219 * the direct block copy loop. If it 219 * the direct block copy loop. If it is 8-byte aligned 220 * we can do the 16-byte loads offset 220 * we can do the 16-byte loads offset by -8 bytes and the 221 * init stores offset by one register. 221 * init stores offset by one register. 222 * 222 * 223 * If the source is not even 8-byte al 223 * If the source is not even 8-byte aligned, we need to do 224 * shifting and masking (basically int 224 * shifting and masking (basically integer faligndata). 225 * 225 * 226 * The careful bit with init stores is 226 * The careful bit with init stores is that if we store 227 * to any part of the cache line we ha 227 * to any part of the cache line we have to store the whole 228 * cacheline else we can end up with c 228 * cacheline else we can end up with corrupt L2 cache line 229 * contents. Since the loop works on 229 * contents. Since the loop works on 64-bytes of 64-byte 230 * aligned store data at a time, this 230 * aligned store data at a time, this is easy to ensure. 231 */ 231 */ 232 2: 232 2: 233 andcc %i1, (16 - 1), %i4 233 andcc %i1, (16 - 1), %i4 234 andn %i2, (64 - 1), %g1 234 andn %i2, (64 - 1), %g1 ! block copy loop iterator 235 be,pt %XCC, 50f 235 be,pt %XCC, 50f 236 sub %i2, %g1, %i2 236 sub %i2, %g1, %i2 ! final sub-block copy bytes 237 237 238 cmp %i4, 8 238 cmp %i4, 8 239 be,pt %XCC, 10f 239 be,pt %XCC, 10f 240 sub %i1, %i4, %i1 240 sub %i1, %i4, %i1 241 241 242 /* Neither 8-byte nor 16-byte aligned, 242 /* Neither 8-byte nor 16-byte aligned, shift and mask. */ 243 and %i4, 0x7, GLOBAL_SPARE 243 and %i4, 0x7, GLOBAL_SPARE 244 sll GLOBAL_SPARE, 3, GLOBA 244 sll GLOBAL_SPARE, 3, GLOBAL_SPARE 245 mov 64, %i5 245 mov 64, %i5 246 EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret 246 EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1) 247 sub %i5, GLOBAL_SPARE, %i5 247 sub %i5, GLOBAL_SPARE, %i5 248 mov 16, %o4 248 mov 16, %o4 249 mov 32, %o5 249 mov 32, %o5 250 mov 48, %o7 250 mov 48, %o7 251 mov 64, %i3 251 mov 64, %i3 252 252 253 bg,pn %XCC, 9f 253 bg,pn %XCC, 9f 254 nop 254 nop 255 255 256 #define MIX_THREE_WORDS(WORD1, WORD2, WORD3, P 256 #define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \ 257 sllx WORD1, POST_SHIFT, WOR 257 sllx WORD1, POST_SHIFT, WORD1; \ 258 srlx WORD2, PRE_SHIFT, TMP; 258 srlx WORD2, PRE_SHIFT, TMP; \ 259 sllx WORD2, POST_SHIFT, WOR 259 sllx WORD2, POST_SHIFT, WORD2; \ 260 or WORD1, TMP, WORD1; \ 260 or WORD1, TMP, WORD1; \ 261 srlx WORD3, PRE_SHIFT, TMP; 261 srlx WORD3, PRE_SHIFT, TMP; \ 262 or WORD2, TMP, WORD2; 262 or WORD2, TMP, WORD2; 263 263 264 8: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), 264 8: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1) 265 MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GL 265 MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1) 266 LOAD(prefetch, %i1 + %i3, #one_read) 266 LOAD(prefetch, %i1 + %i3, #one_read) 267 267 268 EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ 268 EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1) 269 EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ 269 EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8) 270 270 271 EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), 271 EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16) 272 MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GL 272 MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1) 273 273 274 EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ 274 EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16) 275 EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ 275 EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24) 276 276 277 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), 277 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32) 278 MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GL 278 MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1) 279 279 280 EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ 280 EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32) 281 EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ 281 EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40) 282 282 283 EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), 283 EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48) 284 add %i1, 64, %i1 284 add %i1, 64, %i1 285 MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GL 285 MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1) 286 286 287 EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ 287 EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48) 288 EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ 288 EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56) 289 289 290 subcc %g1, 64, %g1 290 subcc %g1, 64, %g1 291 bne,pt %XCC, 8b 291 bne,pt %XCC, 8b 292 add %o0, 64, %o0 292 add %o0, 64, %o0 293 293 294 ba,pt %XCC, 60f 294 ba,pt %XCC, 60f 295 add %i1, %i4, %i1 295 add %i1, %i4, %i1 296 296 297 9: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), 297 9: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1) 298 MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GL 298 MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1) 299 LOAD(prefetch, %i1 + %i3, #one_read) 299 LOAD(prefetch, %i1 + %i3, #one_read) 300 300 301 EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ 301 EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1) 302 EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ 302 EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8) 303 303 304 EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), 304 EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16) 305 MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GL 305 MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1) 306 306 307 EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ 307 EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16) 308 EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ 308 EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24) 309 309 310 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), 310 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32) 311 MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GL 311 MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1) 312 312 313 EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ 313 EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32) 314 EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ 314 EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40) 315 315 316 EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), 316 EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48) 317 add %i1, 64, %i1 317 add %i1, 64, %i1 318 MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GL 318 MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1) 319 319 320 EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ 320 EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48) 321 EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ 321 EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56) 322 322 323 subcc %g1, 64, %g1 323 subcc %g1, 64, %g1 324 bne,pt %XCC, 9b 324 bne,pt %XCC, 9b 325 add %o0, 64, %o0 325 add %o0, 64, %o0 326 326 327 ba,pt %XCC, 60f 327 ba,pt %XCC, 60f 328 add %i1, %i4, %i1 328 add %i1, %i4, %i1 329 329 330 10: /* Destination is 64-byte aligned, sou 330 10: /* Destination is 64-byte aligned, source was only 8-byte 331 * aligned but it has been subtracted 331 * aligned but it has been subtracted by 8 and we perform 332 * one twin load ahead, then add 8 bac 332 * one twin load ahead, then add 8 back into source when 333 * we finish the loop. 333 * we finish the loop. 334 */ 334 */ 335 EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret 335 EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1) 336 mov 16, %o7 336 mov 16, %o7 337 mov 32, %g2 337 mov 32, %g2 338 mov 48, %g3 338 mov 48, %g3 339 mov 64, %o1 339 mov 64, %o1 340 1: EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), 340 1: EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1) 341 LOAD(prefetch, %i1 + %o1, #one_read) 341 LOAD(prefetch, %i1 + %o1, #one_read) 342 EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ 342 EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1) ! initializes cache line 343 EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ 343 EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8) 344 EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), 344 EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16) 345 EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ 345 EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16) 346 EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ 346 EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24) 347 EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), 347 EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32) 348 EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ 348 EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32) 349 EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ 349 EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40) 350 EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), 350 EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48) 351 add %i1, 64, %i1 351 add %i1, 64, %i1 352 EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ 352 EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48) 353 EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ 353 EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56) 354 subcc %g1, 64, %g1 354 subcc %g1, 64, %g1 355 bne,pt %XCC, 1b 355 bne,pt %XCC, 1b 356 add %o0, 64, %o0 356 add %o0, 64, %o0 357 357 358 ba,pt %XCC, 60f 358 ba,pt %XCC, 60f 359 add %i1, 0x8, %i1 359 add %i1, 0x8, %i1 360 360 361 50: /* Destination is 64-byte aligned, and 361 50: /* Destination is 64-byte aligned, and source is 16-byte 362 * aligned. 362 * aligned. 363 */ 363 */ 364 mov 16, %o7 364 mov 16, %o7 365 mov 32, %g2 365 mov 32, %g2 366 mov 48, %g3 366 mov 48, %g3 367 mov 64, %o1 367 mov 64, %o1 368 1: EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), 368 1: EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1) 369 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), 369 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1) 370 LOAD(prefetch, %i1 + %o1, #one_read) 370 LOAD(prefetch, %i1 + %o1, #one_read) 371 EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ 371 EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1) ! initializes cache line 372 EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ 372 EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8) 373 EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), 373 EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16) 374 EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ 374 EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16) 375 EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ 375 EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24) 376 EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), 376 EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32) 377 add %i1, 64, %i1 377 add %i1, 64, %i1 378 EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ 378 EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32) 379 EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ 379 EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40) 380 EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ 380 EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48) 381 EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ 381 EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56) 382 subcc %g1, 64, %g1 382 subcc %g1, 64, %g1 383 bne,pt %XCC, 1b 383 bne,pt %XCC, 1b 384 add %o0, 64, %o0 384 add %o0, 64, %o0 385 /* fall through */ 385 /* fall through */ 386 386 387 60: 387 60: 388 membar #Sync 388 membar #Sync 389 389 390 /* %i2 contains any final bytes still 390 /* %i2 contains any final bytes still needed to be copied 391 * over. If anything is left, we copy 391 * over. If anything is left, we copy it one byte at a time. 392 */ 392 */ 393 RESTORE_ASI(%i3) 393 RESTORE_ASI(%i3) 394 brz,pt %i2, 85f 394 brz,pt %i2, 85f 395 sub %o0, %i1, %i3 395 sub %o0, %i1, %i3 396 ba,a,pt %XCC, 90f 396 ba,a,pt %XCC, 90f 397 nop << 398 397 399 .align 64 398 .align 64 400 70: /* 16 < len <= 64 */ 399 70: /* 16 < len <= 64 */ 401 bne,pn %XCC, 75f 400 bne,pn %XCC, 75f 402 sub %o0, %i1, %i3 401 sub %o0, %i1, %i3 403 402 404 72: 403 72: 405 andn %i2, 0xf, %i4 404 andn %i2, 0xf, %i4 406 and %i2, 0xf, %i2 405 and %i2, 0xf, %i2 407 1: subcc %i4, 0x10, %i4 406 1: subcc %i4, 0x10, %i4 408 EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_p 407 EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4) 409 add %i1, 0x08, %i1 408 add %i1, 0x08, %i1 410 EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_p 409 EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4) 411 sub %i1, 0x08, %i1 410 sub %i1, 0x08, %i1 412 EX_ST(STORE(stx, %o4, %i1 + %i3), NG_r 411 EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4) 413 add %i1, 0x8, %i1 412 add %i1, 0x8, %i1 414 EX_ST(STORE(stx, %g1, %i1 + %i3), NG_r 413 EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8) 415 bgu,pt %XCC, 1b 414 bgu,pt %XCC, 1b 416 add %i1, 0x8, %i1 415 add %i1, 0x8, %i1 417 73: andcc %i2, 0x8, %g0 416 73: andcc %i2, 0x8, %g0 418 be,pt %XCC, 1f 417 be,pt %XCC, 1f 419 nop 418 nop 420 sub %i2, 0x8, %i2 419 sub %i2, 0x8, %i2 421 EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_p 420 EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8) 422 EX_ST(STORE(stx, %o4, %i1 + %i3), NG_r 421 EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8) 423 add %i1, 0x8, %i1 422 add %i1, 0x8, %i1 424 1: andcc %i2, 0x4, %g0 423 1: andcc %i2, 0x4, %g0 425 be,pt %XCC, 1f 424 be,pt %XCC, 1f 426 nop 425 nop 427 sub %i2, 0x4, %i2 426 sub %i2, 0x4, %i2 428 EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_ 427 EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4) 429 EX_ST(STORE(stw, %i5, %i1 + %i3), NG_r 428 EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4) 430 add %i1, 0x4, %i1 429 add %i1, 0x4, %i1 431 1: cmp %i2, 0 430 1: cmp %i2, 0 432 be,pt %XCC, 85f 431 be,pt %XCC, 85f 433 nop 432 nop 434 ba,pt %xcc, 90f 433 ba,pt %xcc, 90f 435 nop 434 nop 436 435 437 75: 436 75: 438 andcc %o0, 0x7, %g1 437 andcc %o0, 0x7, %g1 439 sub %g1, 0x8, %g1 438 sub %g1, 0x8, %g1 440 be,pn %icc, 2f 439 be,pn %icc, 2f 441 sub %g0, %g1, %g1 440 sub %g0, %g1, %g1 442 sub %i2, %g1, %i2 441 sub %i2, %g1, %i2 443 442 444 1: subcc %g1, 1, %g1 443 1: subcc %g1, 1, %g1 445 EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_ 444 EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1) 446 EX_ST(STORE(stb, %i5, %i1 + %i3), NG_r 445 EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1) 447 bgu,pt %icc, 1b 446 bgu,pt %icc, 1b 448 add %i1, 1, %i1 447 add %i1, 1, %i1 449 448 450 2: add %i1, %i3, %o0 449 2: add %i1, %i3, %o0 451 andcc %i1, 0x7, %g1 450 andcc %i1, 0x7, %g1 452 bne,pt %icc, 8f 451 bne,pt %icc, 8f 453 sll %g1, 3, %g1 452 sll %g1, 3, %g1 454 453 455 cmp %i2, 16 454 cmp %i2, 16 456 bgeu,pt %icc, 72b 455 bgeu,pt %icc, 72b 457 nop 456 nop 458 ba,a,pt %xcc, 73b 457 ba,a,pt %xcc, 73b 459 458 460 8: mov 64, %i3 459 8: mov 64, %i3 461 andn %i1, 0x7, %i1 460 andn %i1, 0x7, %i1 462 EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2) 461 EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2) 463 sub %i3, %g1, %i3 462 sub %i3, %g1, %i3 464 andn %i2, 0x7, %i4 463 andn %i2, 0x7, %i4 465 sllx %g2, %g1, %g2 464 sllx %g2, %g1, %g2 466 1: add %i1, 0x8, %i1 465 1: add %i1, 0x8, %i1 467 EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_a 466 EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4) 468 subcc %i4, 0x8, %i4 467 subcc %i4, 0x8, %i4 469 srlx %g3, %i3, %i5 468 srlx %g3, %i3, %i5 470 or %i5, %g2, %i5 469 or %i5, %g2, %i5 471 EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_ 470 EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4) 472 add %o0, 0x8, %o0 471 add %o0, 0x8, %o0 473 bgu,pt %icc, 1b 472 bgu,pt %icc, 1b 474 sllx %g3, %g1, %g2 473 sllx %g3, %g1, %g2 475 474 476 srl %g1, 3, %g1 475 srl %g1, 3, %g1 477 andcc %i2, 0x7, %i2 476 andcc %i2, 0x7, %i2 478 be,pn %icc, 85f 477 be,pn %icc, 85f 479 add %i1, %g1, %i1 478 add %i1, %g1, %i1 480 ba,pt %xcc, 90f 479 ba,pt %xcc, 90f 481 sub %o0, %i1, %i3 480 sub %o0, %i1, %i3 482 481 483 .align 64 482 .align 64 484 80: /* 0 < len <= 16 */ 483 80: /* 0 < len <= 16 */ 485 andcc %i3, 0x3, %g0 484 andcc %i3, 0x3, %g0 486 bne,pn %XCC, 90f 485 bne,pn %XCC, 90f 487 sub %o0, %i1, %i3 486 sub %o0, %i1, %i3 488 487 489 1: 488 1: 490 subcc %i2, 4, %i2 489 subcc %i2, 4, %i2 491 EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_ 490 EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4) 492 EX_ST(STORE(stw, %g1, %i1 + %i3), NG_r 491 EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4) 493 bgu,pt %XCC, 1b 492 bgu,pt %XCC, 1b 494 add %i1, 4, %i1 493 add %i1, 4, %i1 495 494 496 85: ret 495 85: ret 497 restore EX_RETVAL(%i0), %g0, % 496 restore EX_RETVAL(%i0), %g0, %o0 498 497 499 .align 32 498 .align 32 500 90: 499 90: 501 subcc %i2, 1, %i2 500 subcc %i2, 1, %i2 502 EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_ 501 EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1) 503 EX_ST(STORE(stb, %g1, %i1 + %i3), NG_r 502 EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1) 504 bgu,pt %XCC, 90b 503 bgu,pt %XCC, 90b 505 add %i1, 1, %i1 504 add %i1, 1, %i1 506 ret 505 ret 507 restore EX_RETVAL(%i0), %g0, % 506 restore EX_RETVAL(%i0), %g0, %o0 508 507 509 .size FUNC_NAME, .-FUNC_NAME 508 .size FUNC_NAME, .-FUNC_NAME
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.