/* SPDX-License-Identifier: GPL-2.0 */
/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/asi.h>
#include <asm/thread_info.h>
#define GLOBAL_SPARE	%g7
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_AIUS, %asi
#else
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

#ifdef __sparc_v9__
#define SAVE_AMOUNT	128
#else
#define SAVE_AMOUNT	64
#endif

#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
#endif
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x

	/* Exception-return stubs.  Each EX_LD/EX_ST site names one of
	 * the NG_ret_* entries below as its fault continuation; the stub
	 * computes the number of bytes NOT yet copied into %i0 from the
	 * registers live at that point in the copy (%i2 plus a loop
	 * counter/offset), then restores the %asi register and the
	 * caller's register window via __restore_asi.
	 */
__restore_asi:
	ret
	wr	%g0, ASI_AIUS, %asi
	restore
ENTRY(NG_ret_i2_plus_i4_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i5, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_1)
ENTRY(NG_ret_i2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1)
ENTRY(NG_ret_i2_plus_g1_minus_8)
	sub	%g1, 8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_8)
ENTRY(NG_ret_i2_plus_g1_minus_16)
	sub	%g1, 16, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_16)
ENTRY(NG_ret_i2_plus_g1_minus_24)
	sub	%g1, 24, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_24)
ENTRY(NG_ret_i2_plus_g1_minus_32)
	sub	%g1, 32, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_32)
ENTRY(NG_ret_i2_plus_g1_minus_40)
	sub	%g1, 40, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_40)
ENTRY(NG_ret_i2_plus_g1_minus_48)
	sub	%g1, 48, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_48)
ENTRY(NG_ret_i2_plus_g1_minus_56)
	sub	%g1, 56, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_56)
ENTRY(NG_ret_i2_plus_i4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4)
ENTRY(NG_ret_i2_plus_i4_minus_8)
	sub	%i4, 8, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_minus_8)
ENTRY(NG_ret_i2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 8, %i0
ENDPROC(NG_ret_i2_plus_8)
ENTRY(NG_ret_i2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 4, %i0
ENDPROC(NG_ret_i2_plus_4)
ENTRY(NG_ret_i2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 1, %i0
ENDPROC(NG_ret_i2_plus_1)
ENTRY(NG_ret_i2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_plus_1)
ENTRY(NG_ret_i2)
	ba,pt	%xcc, __restore_asi
	 mov	%i2, %i0
ENDPROC(NG_ret_i2)
ENTRY(NG_ret_i2_and_7_plus_i4)
	and	%i2, 7, %i2
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_and_7_plus_i4)
#endif

	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %i0=dst, %i1=src, %i2=len */
	PREAMBLE
	save		%sp, -SAVE_AMOUNT, %sp
	srlx		%i2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	mov		%i0, %o0
	cmp		%i2, 0
	be,pn		%XCC, 85f
	 or		%o0, %i1, %i3
	cmp		%i2, 16
	blu,a,pn	%XCC, 80f
	 or		%i3, %i2, %i3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
	cmp		%i2, (2 * 64)
	blu,pt		%XCC, 70f
	 andcc		%i3, 0x7, %g0

	/* %o0:	dst
	 * %i1:	src
	 * %i2:	len  (known to be >= 128)
	 *
	 * The block copy loops will use %i4/%i5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %i1, #one_read)
	wr		%g0, STORE_ASI, %asi

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %i4
	be,pt		%XCC, 2f
	 sub		%i4, 64, %i4
	sub		%g0, %i4, %i4	! bytes to align dst
	sub		%i2, %i4, %i2
1:	subcc		%i4, 1, %i4
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
	EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
	add		%i1, 1, %i1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop.  If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line we have to store the whole
	 * cacheline else we can end up with corrupt L2 cache line
	 * contents.  Since the loop works on 64-bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
2:
	andcc		%i1, (16 - 1), %i4
	andn		%i2, (64 - 1), %g1	! block copy loop iterator
	be,pt		%XCC, 50f
	 sub		%i2, %g1, %i2		! final sub-block copy bytes

	cmp		%i4, 8
	be,pt		%XCC, 10f
	 sub		%i1, %i4, %i1

	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
	and		%i4, 0x7, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE
	mov		64, %i5
	EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
	sub		%i5, GLOBAL_SPARE, %i5
	mov		16, %o4
	mov		32, %o5
	mov		48, %o7
	mov		64, %i3

	bg,pn		%XCC, 9f
	 nop

#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
	sllx		WORD1, POST_SHIFT, WORD1; \
	srlx		WORD2, PRE_SHIFT, TMP; \
	sllx		WORD2, POST_SHIFT, WORD2; \
	or		WORD1, TMP, WORD1; \
	srlx		WORD3, PRE_SHIFT, TMP; \
	or		WORD2, TMP, WORD2;

8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 8b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 9b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned but it has been subtracted by 8 and we perform
	 * one twin load ahead, then add 8 back into source when
	 * we finish the loop.
	 */
	EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, 0x8, %i1

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	add		%i1, 64, %i1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0
	/* fall through */

60:
	membar		#Sync

	/* %i2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
	RESTORE_ASI(%i3)
	brz,pt		%i2, 85f
	 sub		%o0, %i1, %i3
	ba,a,pt		%XCC, 90f
	 nop

	.align		64
70: /* 16 < len <= 64 */
	bne,pn		%XCC, 75f
	 sub		%o0, %i1, %i3

72:
	andn		%i2, 0xf, %i4
	and		%i2, 0xf, %i2
1:	subcc		%i4, 0x10, %i4
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4)
	add		%i1, 0x08, %i1
	EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4)
	sub		%i1, 0x08, %i1
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4)
	add		%i1, 0x8, %i1
	EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8)
	bgu,pt		%XCC, 1b
	 add		%i1, 0x8, %i1
73:	andcc		%i2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x8, %i2
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
	add		%i1, 0x8, %i1
1:	andcc		%i2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x4, %i2
	EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
	add		%i1, 0x4, %i1
1:	cmp		%i2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%i2, %g1, %i2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
	EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%i1, 1, %i1

2:	add		%i1, %i3, %o0
	andcc		%i1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%i2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

8:	mov		64, %i3
	andn		%i1, 0x7, %i1
	EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
	sub		%i3, %g1, %i3
	andn		%i2, 0x7, %i4
	sllx		%g2, %g1, %g2
1:	add		%i1, 0x8, %i1
	EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
	subcc		%i4, 0x8, %i4
	srlx		%g3, %i3, %i5
	or		%i5, %g2, %i5
	EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%i2, 0x7, %i2
	be,pn		%icc, 85f
	 add		%i1, %g1, %i1
	ba,pt		%xcc, 90f
	 sub		%o0, %i1, %i3

	.align		64
80: /* 0 < len <= 16 */
	andcc		%i3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %i1, %i3

1:
	subcc		%i2, 4, %i2
	EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%i1, 4, %i1

85:	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.align		32
90:
	subcc		%i2, 1, %i2
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
	EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%i1, 1, %i1
	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.size		FUNC_NAME, .-FUNC_NAME
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.