/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */
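
/*
 * Reference model (an illustrative sketch, not part of the build): the
 * C equivalent of the copy-direction decision implemented below.  When
 * dest overlaps the tail of src, the copy must run backwards so that
 * source bytes are read before they are overwritten.  The helper name
 * memmove_ref is hypothetical.
 *
 *	void *memmove_ref(void *dest, const void *src, size_t n)
 *	{
 *		unsigned char *d = dest;
 *		const unsigned char *s = src;
 *
 *		if (d < s) {
 *			while (n--)
 *				*d++ = *s++;	// forward copy
 *		} else if (d > s) {
 *			d += n;
 *			s += n;
 *			while (n--)
 *				*--d = *--s;	// reverse copy
 *		}
 *		return dest;
 *	}
 */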

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of the shifts
	 * in the 2 misaligned fixup copy loops.
	 */

	/* Return if nothing to do */
	beq a0, a1, .Lreturn_from_memmove
	beqz a2, .Lreturn_from_memmove

	/*
	 * Register Uses
	 *      Forward Copy: a1 - Index counter of src
	 *      Reverse Copy: a4 - Index counter of src
	 *      Forward Copy: t3 - Index counter of dest
	 *      Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first SZREG-aligned byte of dest
	 *   Both Copy Modes: t6 - Non-inclusive last SZREG-aligned byte of dest
	 *   Both Copy Modes: t0 - Link / temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv t3, a0
	add t4, a0, a2
	add a4, a1, a2

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes.  This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi t0, a2, -(2 * SZREG)
	beqz t0, .Lbyte_copy

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest (register t3) rounded down to the nearest naturally
	 * aligned SZREG address does not equal dest, then add SZREG
	 * to find the low bound of SZREG alignment in the dest memory
	 * region.  Note that this could overshoot the dest memory
	 * region if n is less than SZREG.  This is one reason why
	 * we always byte copy if n is less than (2 * SZREG).
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq t5, t3, 1f
	addi t5, t5, SZREG
	1:

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, .Lcoaligned_copy
	/* Fall through to misaligned fixup copy */

.Lmisaligned_fixup_copy:
	bltu a1, a0, .Lmisaligned_fixup_copy_reverse

.Lmisaligned_fixup_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

	andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
	slli a6, a5, 3 /* Multiply by 8 to convert bytes to bits */
	sub a5, a1, t3 /* Find the difference between src and dest */
	andi a1, a1, -SZREG /* Align the src pointer */
	addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute the inverse shift
	 *   a7 = XLEN - a6 = XLEN + -a6
	 * Use two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not a7, a6
	addi a7, a7, (SZREG * 8 + 1)
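
	/*
	 * Worked example (illustrative, assuming RV64 where SZREG = 8 and
	 * XLEN = 64): if src sits 3 bytes past an aligned address, then
	 * a5 = 3 and a6 = 24, so a7 = ~24 + 65 = 64 - 24 = 40.  Each
	 * aligned store below is then assembled from the top 40 bits of
	 * one aligned load and the bottom 24 bits of the next:
	 *
	 *	store_val = (load_val0 >> 24) | (load_val1 << 40);
	 */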

	/*
	 * Fix Misalignment Copy Loop - Forward
	 * load_val0 = load_ptr[0];
	 * do {
	 *	load_val1 = load_ptr[1];
	 *	store_ptr += 2;
	 *	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val0 = load_ptr[2];
	 *	load_ptr += 2;
	 *	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t0, (0 * SZREG)(a1)
	1:
	REG_L t1, (1 * SZREG)(a1)
	addi t3, t3, (2 * SZREG)
	srl t0, t0, a6
	sll t2, t1, a7
	or t2, t0, t2
	REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

	beq t3, a2, 2f

	REG_L t0, (2 * SZREG)(a1)
	addi a1, a1, (2 * SZREG)
	srl t1, t1, a6
	sll t2, t0, a7
	or t2, t1, t2
	REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

	bne t3, t6, 1b
	2:
	mv t3, t6 /* Fix the dest pointer in case the loop was broken early */

	add a1, t3, a5 /* Restore the src pointer */
	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lmisaligned_fixup_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

	andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
	slli a6, a5, 3 /* Multiply by 8 to convert bytes to bits */
	sub a5, a4, t4 /* Find the difference between src and dest */
	andi a4, a4, -SZREG /* Align the src pointer */
	addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

	/*
	 * Compute the inverse shift
	 *   a7 = XLEN - a6 = XLEN + -a6
	 * Use two's complement negation to find the negative: -a6 = ~a6 + 1
	 * Add that to XLEN.  XLEN = SZREG * 8.
	 */
	not a7, a6
	addi a7, a7, (SZREG * 8 + 1)

	/*
	 * Fix Misalignment Copy Loop - Reverse
	 * load_val1 = load_ptr[0];
	 * do {
	 *	load_val0 = load_ptr[-1];
	 *	store_ptr -= 2;
	 *	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
	 *
	 *	if (store_ptr == {a2})
	 *		break;
	 *
	 *	load_val1 = load_ptr[-2];
	 *	load_ptr -= 2;
	 *	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
	 *
	 * } while (store_ptr != store_ptr_end);
	 * store_ptr = store_ptr_end;
	 */

	REG_L t1, ( 0 * SZREG)(a4)
	1:
	REG_L t0, (-1 * SZREG)(a4)
	addi t4, t4, (-2 * SZREG)
	sll t1, t1, a7
	srl t2, t0, a6
	or t2, t1, t2
	REG_S t2, ( 1 * SZREG)(t4)

	beq t4, a2, 2f

	REG_L t1, (-2 * SZREG)(a4)
	addi a4, a4, (-2 * SZREG)
	sll t0, t0, a7
	srl t2, t1, a6
	or t2, t0, t2
	REG_S t2, ( 0 * SZREG)(t4)

	bne t4, t5, 1b
	2:
	mv t4, t5 /* Fix the dest pointer in case the loop was broken early */

	add a4, t4, a5 /* Restore the src pointer */
	j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
 */
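/*
 * Illustrative C equivalent of the forward co-aligned loop below (a
 * sketch only; coaligned_copy_fwd is a hypothetical helper and
 * unsigned long stands in for one SZREG-sized register):
 *
 *	static void coaligned_copy_fwd(unsigned long *d, const unsigned long *s,
 *				       unsigned long *d_end)
 *	{
 *		while (d != d_end)
 *			*d++ = *s++;
 *	}
 *
 * The reverse loop mirrors this, walking both pointers down from the
 * ends of the regions instead.
 */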
.Lcoaligned_copy:
	bltu a1, a0, .Lcoaligned_copy_reverse

.Lcoaligned_copy_forward:
	jal t0, .Lbyte_copy_until_aligned_forward

	1:
	REG_L t1, ( 0 * SZREG)(a1)
	addi a1, a1, SZREG
	addi t3, t3, SZREG
	REG_S t1, (-1 * SZREG)(t3)
	bne t3, t6, 1b

	j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lcoaligned_copy_reverse:
	jal t0, .Lbyte_copy_until_aligned_reverse

	1:
	REG_L t1, (-1 * SZREG)(a4)
	addi a4, a4, -SZREG
	addi t4, t4, -SZREG
	REG_S t1, ( 0 * SZREG)(t4)
	bne t4, t5, 1b

	j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function.  They
 * are used to byte copy until the dest pointer is in alignment.
 * At which point, a bulk copy method can be used by the
 * calling code.  These work on the same registers as the bulk
 * copy loops.  Therefore, the register values can be picked
 * up from where they were left and we avoid code duplication
 * without any overhead except the call-in and return jumps.
 */
.Lbyte_copy_until_aligned_forward:
	beq t3, t5, 2f
	1:
	lb t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb t1, -1(t3)
	bne t3, t5, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

.Lbyte_copy_until_aligned_reverse:
	beq t4, t6, 2f
	1:
	lb t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb t1, 0(t4)
	bne t4, t6, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of the data to copy.
 * At that point, they will return from memmove.
 */
.Lbyte_copy:
	bltu a1, a0, .Lbyte_copy_reverse

.Lbyte_copy_forward:
	beq t3, t4, 2f
	1:
	lb t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb t1, -1(t3)
	bne t3, t4, 1b
	2:
	ret

.Lbyte_copy_reverse:
	beq t4, t3, 2f
	1:
	lb t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb t1, 0(t4)
	bne t4, t3, 1b
	2:

.Lreturn_from_memmove:
	ret

SYM_FUNC_END(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)
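
/*
 * Usage sketch (illustrative only, not part of this file): overlapping
 * regions are what distinguish memmove() from memcpy().
 *
 *	char buf[8] = "abcdef";
 *
 *	memmove(buf + 2, buf, 4);
 *	// buf is now "ababcd"; a naive forward byte copy would instead
 *	// produce "ababab", reading bytes it had already overwritten.
 */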