/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */
#include <linux/config.h>
#include <asm/asm.h>
#include <asm/offset.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
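As a rough C model of the calling convention spelled out above (illustrative only, not part of this file; fault_on_load and fault_on_store are made-up stand-ins for real MMU faults):

#include <stddef.h>
#include <string.h>

/* Stand-ins for real MMU faults; in this sketch they never fault. */
static int fault_on_load(const void *p)  { (void)p; return 0; }
static int fault_on_store(const void *p) { (void)p; return 0; }

/*
 * Sketch of the __copy_user contract: return the number of bytes that
 * could NOT be copied; on a faulting load, clear the rest of the
 * destination so no stale kernel data leaks to user space.
 */
static size_t copy_user_model(void *dst, const void *src, size_t len)
{
        char *d = dst;
        const char *s = src;
        size_t left = len;

        while (left) {
                if (fault_on_load(s)) {
                        memset(d, 0, left);     /* zero the uncopied tail */
                        return left;
                }
                if (fault_on_store(d))
                        return left;
                *d++ = *s++;
                left--;
        }
        return 0;                               /* success: nothing left */
}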
#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_MIPS64
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register names from
 * the n64 ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */
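NBYTES, LOG_NBYTES and ADDRMASK encode the register-sized copy unit that the rest of the file works in. A small C sketch of the same arithmetic (illustrative only; unit_t and the _C suffixed names are made up for this sketch and are not the macros above):

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Stand-in for the register width: 8 bytes with USE_DOUBLE, else 4. */
typedef unsigned long unit_t;

#define NBYTES_C     sizeof(unit_t)
#define LOG_NBYTES_C (NBYTES_C == 8 ? 3 : 2)  /* shift that divides by NBYTES */
#define ADDRMASK_C   (NBYTES_C - 1)           /* low bits give the misalignment */

int main(void)
{
        uintptr_t addr = 0x1003;
        size_t len = 100;

        /* len >> (LOG_NBYTES + 3) is the number of 8-unit iterations, the
         * same quantity the SRL with LOG_NBYTES+3 computes in the
         * both_aligned path further below. */
        printf("misalignment      = %zu\n", (size_t)(addr & ADDRMASK_C));
        printf("8-unit iterations = %zu\n", len >> (LOG_NBYTES_C + 3));
        return 0;
}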
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
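LDFIRST/LDREST pair up the left/right partial-word loads (lwl/lwr or ldl/ldr) so that two instructions together assemble one possibly unaligned word, with the pairing flipped by endianness; FIRST(n)/REST(n) are just the first and last byte offsets of unit n. A portable C equivalent of "load one unaligned word" (illustrative sketch only; the assembly uses the instruction pair, not memcpy):

#include <stdint.h>
#include <string.h>

/* Fetch a word starting at an address with no alignment guarantee,
 * which is what the LDFIRST/LDREST pair achieves in hardware. */
static uint32_t load_unaligned_word(const void *p)
{
        uint32_t w;

        memcpy(&w, p, sizeof(w));       /* byte-wise copy, no alignment trap */
        return w;
}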
        .text
        .set    noreorder
        .set    noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
        .align  5
LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
        move    v0, dst                         /* return value */
__memcpy:
FEXPORT(__copy_user)
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
#define rem t8

        /*
         * The "issue break"s below are very approximate.
         * Issue delays for dcache fills will perturb the schedule, as will
         * load queue full replay traps, etc.
         *
         * If len < NBYTES use byte operations.
         */
        PREF(   0, 0(src) )
        PREF(   1, 0(dst) )
        sltu    t2, len, NBYTES
        and     t1, dst, ADDRMASK
        PREF(   0, 1*32(src) )
        PREF(   1, 1*32(dst) )
        bnez    t2, copy_bytes_checklen
        and     t0, src, ADDRMASK
        PREF(   0, 2*32(src) )
        PREF(   1, 2*32(dst) )
        bnez    t1, dst_unaligned
        nop
        bnez    t0, src_unaligned_dst_aligned
        /*
         * use delay slot for fall-through
         * src and dst are aligned; need to compute rem
         */
both_aligned:
        SRL     t0, len, LOG_NBYTES+3           # +3 for 8 units/iter
        beqz    t0, cleanup_both_aligned        # len < 8*NBYTES
        and     rem, len, (8*NBYTES-1)          # rem = len % (8*NBYTES)
        PREF(   0, 3*32(src) )
        PREF(   1, 3*32(dst) )
        .align  4
1:
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 8*NBYTES
EXC(    LOAD    t4, UNIT(4)(src),       l_exc_copy)
EXC(    LOAD    t7, UNIT(5)(src),       l_exc_copy)
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p8u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p7u)
EXC(    LOAD    t0, UNIT(6)(src),       l_exc_copy)
EXC(    LOAD    t1, UNIT(7)(src),       l_exc_copy)
        ADD     src, src, 8*NBYTES
        ADD     dst, dst, 8*NBYTES
EXC(    STORE   t2, UNIT(-6)(dst),      s_exc_p6u)
EXC(    STORE   t3, UNIT(-5)(dst),      s_exc_p5u)
EXC(    STORE   t4, UNIT(-4)(dst),      s_exc_p4u)
EXC(    STORE   t7, UNIT(-3)(dst),      s_exc_p3u)
EXC(    STORE   t0, UNIT(-2)(dst),      s_exc_p2u)
EXC(    STORE   t1, UNIT(-1)(dst),      s_exc_p1u)
        PREF(   0, 8*32(src) )
        PREF(   1, 8*32(dst) )
        bne     len, rem, 1b
        nop

        /*
         * len == rem == the number of bytes left to copy < 8*NBYTES
         */
cleanup_both_aligned:
        beqz    len, done
        sltu    t0, len, 4*NBYTES
        bnez    t0, less_than_4units
        and     rem, len, (NBYTES-1)            # rem = len % NBYTES
        /*
         * len >= 4*NBYTES
         */
EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
        SUB     len, len, 4*NBYTES
        ADD     src, src, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        beqz    len, done
        ADD     dst, dst, 4*NBYTES
less_than_4units:
        /*
         * rem = len % NBYTES
         */
        beq     rem, len, copy_bytes
        nop
1:
EXC(    LOAD    t0, 0(src),             l_exc)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        bne     rem, len, 1b
        ADD     dst, dst, NBYTES
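The aligned path above streams eight register-sized units per iteration and keeps the loop test cheap by precomputing rem = len % (8*NBYTES) and looping until len reaches it, with loads and stores interleaved to hide memory latency. A compact C sketch of that control structure (illustrative only, word-at-a-time instead of the hand-scheduled unroll):

#include <stddef.h>

/* Control structure of the aligned bulk copy (illustrative).  rem is
 * computed once; the loop runs while len != rem, subtracting one whole
 * block per pass, the same trick as "bne len, rem, 1b". */
static void copy_aligned_sketch(unsigned long *dst, const unsigned long *src,
                                size_t len)
{
        const size_t block = 8 * sizeof(unsigned long);
        size_t rem = len & (block - 1);         /* len % (8*NBYTES) */

        while (len != rem) {
                for (size_t i = 0; i < 8; i++)
                        dst[i] = src[i];
                src += 8;
                dst += 8;
                len -= block;
        }
        /* the remaining rem bytes are handled by the cleanup paths */
}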
        /*
         * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
         * A loop would do only a byte at a time with possible branch
         * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
         * because can't assume read-access to dst.  Instead, use
         * STREST dst, which doesn't require read access to dst.
         *
         * This code should perform better than a simple loop on modern,
         * wide-issue mips processors because the code has fewer branches and
         * more instruction-level parallelism.
         */
#define bits t2
        beqz    len, done
        ADD     t1, dst, len            # t1 is just past last byte of dst
        li      bits, 8*NBYTES
        SLL     rem, len, 3             # rem = number of bits to keep
EXC(    LOAD    t0, 0(src),             l_exc)
        SUB     bits, bits, rem         # bits = number of bits to discard
        SHIFT_DISCARD t0, t0, bits
EXC(    STREST  t0, -1(t1),             s_exc)
        jr      ra
        move    len, zero
dst_unaligned:
        /*
         * dst is unaligned
         * t0 = src & ADDRMASK
         * t1 = dst & ADDRMASK; T1 > 0
         * len >= NBYTES
         *
         * Copy enough bytes to align dst
         * Set match = (src and dst have same alignment)
         */
#define match rem
EXC(    LDFIRST t3, FIRST(0)(src),      l_exc)
        ADD     t2, zero, NBYTES
EXC(    LDREST  t3, REST(0)(src),       l_exc_copy)
        SUB     t2, t2, t1              # t2 = number of bytes copied
        xor     match, t0, t1
EXC(    STFIRST t3, FIRST(0)(dst),      s_exc)
        beq     len, t2, done
        SUB     len, len, t2
        ADD     dst, dst, t2
        beqz    match, both_aligned
        ADD     src, src, t2
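dst_unaligned above first copies just enough bytes to bring dst onto a unit boundary, then tests whether src and dst now share the same alignment by xoring their low address bits. A C sketch of that decision (illustrative; align_dst_head and same_alignment are made-up names):

#include <stddef.h>
#include <stdint.h>

#define NBYTES   sizeof(unsigned long)
#define ADDRMASK (NBYTES - 1)

/* Decide how the bulk of the copy will run (illustrative): return the
 * number of head bytes needed to align dst, and report whether src and
 * dst then land on a unit boundary together so the aligned loop can run. */
static size_t align_dst_head(uintptr_t src, uintptr_t dst, int *same_alignment)
{
        size_t head = NBYTES - (dst & ADDRMASK);   /* bytes until dst aligns */

        /* match = (src & ADDRMASK) ^ (dst & ADDRMASK): zero iff both
         * pointers share the same misalignment. */
        *same_alignment = (((src & ADDRMASK) ^ (dst & ADDRMASK)) == 0);
        return head;
}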
src_unaligned_dst_aligned:
        SRL     t0, len, LOG_NBYTES+2   # +2 for 4 units/iter
        PREF(   0, 3*32(src) )
        beqz    t0, cleanup_src_unaligned
        and     rem, len, (4*NBYTES-1)  # rem = len % 4*NBYTES
        PREF(   1, 3*32(dst) )
1:
        /*
         * Avoid consecutive LD*'s to the same register since some mips
         * implementations can't issue them in the same cycle.
         * It's OK to load FIRST(N+1) before REST(N) because the two addresses
         * are to the same unit (unless src is aligned, but it's not).
         */
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
        PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
        ADD     src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
        nop                             # improves slotting
#endif
EXC(    STORE   t0, UNIT(0)(dst),       s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst),       s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst),       s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst),       s_exc_p1u)
        PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
        bne     len, rem, 1b
        ADD     dst, dst, 4*NBYTES

cleanup_src_unaligned:
        beqz    len, done
        and     rem, len, NBYTES-1      # rem = len % NBYTES
        beq     rem, len, copy_bytes
        nop
1:
EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
        ADD     src, src, NBYTES
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst),             s_exc_p1u)
        bne     len, rem, 1b
        ADD     dst, dst, NBYTES

copy_bytes_checklen:
        beqz    len, done
        nop
copy_bytes:
        /* 0 < len < NBYTES */
#define COPY_BYTE(N)                    \
EXC(    lb      t0, N(src), l_exc);     \
        SUB     len, len, 1;            \
        beqz    len, done;              \
EXC(    sb      t0, N(dst), s_exc_p1)

        COPY_BYTE(0)
        COPY_BYTE(1)
#ifdef USE_DOUBLE
        COPY_BYTE(2)
        COPY_BYTE(3)
        COPY_BYTE(4)
        COPY_BYTE(5)
#endif
EXC(    lb      t0, NBYTES-2(src), l_exc)
        SUB     len, len, 1
        jr      ra
EXC(    sb      t0, NBYTES-2(dst), s_exc_p1)
done:
        jr      ra
        nop
        END(memcpy)
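copy_bytes above finishes the final 0 < len < NBYTES bytes with an unrolled byte copy that exits as soon as len reaches zero. The equivalent plain loop in C (illustrative only):

#include <stddef.h>

/* Byte tail of the copy, 0 < len < NBYTES: the straightforward form of
 * the unrolled COPY_BYTE sequence above. */
static void copy_byte_tail(char *dst, const char *src, size_t len)
{
        while (len--)
                *dst++ = *src++;
}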
l_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
        nop
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lb      t1, 0(src),     l_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)      # can't fault -- we're copy_from_user
        bne     src, t0, 1b
        ADD     dst, dst, 1
l_exc:
        LOAD    t0, TI_TASK($28)
        nop
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
        nop
        SUB     len, AT, t0             # len = number of uncopied bytes
        /*
         * Here's where we rely on src and dst being incremented in tandem,
         * See (3) above.
         * dst += (fault addr - src) to put dst at first byte to clear
         */
        ADD     dst, t0                 # compute start address in a1
        SUB     dst, src
        /*
         * Clear len bytes starting at dst.  Can't call __bzero because it
         * might modify len.  An inefficient loop for these rare times...
         */
        beqz    len, done
        SUB     src, len, 1
1:      sb      zero, 0(dst)
        ADD     dst, dst, 1
        bnez    src, 1b
        SUB     src, src, 1
        jr      ra
        nop


#define SEXC(n)                         \
s_exc_p ## n ## u:                      \
        jr      ra;                     \
        ADD     len, len, n*NBYTES

SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
        jr      ra
        ADD     len, len, 1
s_exc:
        jr      ra
        nop
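l_exc above recovers the uncopied count from the faulting address: AT holds the address just past the end of the source, so len becomes AT minus the fault address, and because src and dst advance in tandem the first destination byte to clear is dst + (fault addr - src). A C model of that arithmetic (illustrative; the parameter names are made up, with src_end playing the role of AT and fault_addr the role of THREAD_BUADDR):

#include <stddef.h>
#include <string.h>

/* Model of the l_exc fix-up above (illustrative sketch). */
static size_t fixup_load_fault(char *dst, const char *src,
                               const char *fault_addr, const char *src_end)
{
        size_t uncopied = (size_t)(src_end - fault_addr);

        /* dst += fault_addr - src: first destination byte not yet written */
        dst += fault_addr - src;

        /* clear the rest of the destination so no kernel data leaks */
        memset(dst, 0, uncopied);
        return uncopied;                /* becomes the new value of len (a2) */
}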
        .align  5
LEAF(memmove)
        ADD     t0, a0, a2
        ADD     t1, a1, a2
        sltu    t0, a1, t0              # dst + len <= src -> memcpy
        sltu    t1, a0, t1              # dst >= src + len -> memcpy
        and     t0, t1
        beqz    t0, __memcpy
        move    v0, a0                  /* return value */
        beqz    a2, r_out
        END(memmove)

        /* fall through to __rmemcpy */
LEAF(__rmemcpy)                         /* a0=dst a1=src a2=len */
        sltu    t0, a1, a0
        beqz    t0, r_end_bytes_up      # src >= dst
        nop
        ADD     a0, a2                  # dst = dst + len
        ADD     a1, a2                  # src = src + len

r_end_bytes:
        lb      t0, -1(a1)
        SUB     a2, a2, 0x1
        sb      t0, -1(a0)
        SUB     a1, a1, 0x1
        bnez    a2, r_end_bytes
        SUB     a0, a0, 0x1

r_out:
        jr      ra
        move    a2, zero

r_end_bytes_up:
        lb      t0, (a1)
        SUB     a2, a2, 0x1
        sb      t0, (a0)
        ADD     a1, a1, 0x1
        bnez    a2, r_end_bytes_up
        ADD     a0, a0, 0x1

        jr      ra
        move    a2, zero
        END(__rmemcpy)
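memmove above forwards to __memcpy whenever the regions do not overlap; when they do, __rmemcpy copies forward if src >= dst and backward (from the end) if src < dst, so earlier source bytes are not clobbered before they are read. An illustrative C version of that decision (memmove_sketch is a made-up name, not the kernel's memmove):

#include <stddef.h>

static void *memmove_sketch(void *dst, const void *src, size_t len)
{
        char *d = dst;
        const char *s = src;
        int overlap = (s < d + len) && (d < s + len);

        if (!overlap || s >= d) {
                while (len--)
                        *d++ = *s++;            /* forward, memcpy-style */
        } else {
                d += len;
                s += len;
                while (len--)
                        *--d = *--s;            /* backward, __rmemcpy-style */
        }
        return dst;
}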