/* SPDX-License-Identifier: GPL-2.0 */
/* memcpy.S: Sparc optimized memcpy and memmove code
 * Hand optimized from GNU libc's memcpy and memmove
 * Copyright (C) 1991,1996 Free Software Foundation
 * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 */

#include <linux/export.h>

#define FUNC(x)			\
	.globl	x;		\
	.type	x,@function;	\
	.align	4;		\
x:

/* Both these macros have to start with exactly the same insn */
#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	ldd	[%src + (offset) + 0x10], %t4; \
	ldd	[%src + (offset) + 0x18], %t6; \
	st	%t0, [%dst + (offset) + 0x00]; \
	st	%t1, [%dst + (offset) + 0x04]; \
	st	%t2, [%dst + (offset) + 0x08]; \
	st	%t3, [%dst + (offset) + 0x0c]; \
	st	%t4, [%dst + (offset) + 0x10]; \
	st	%t5, [%dst + (offset) + 0x14]; \
	st	%t6, [%dst + (offset) + 0x18]; \
	st	%t7, [%dst + (offset) + 0x1c];

#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	ldd	[%src + (offset) + 0x10], %t4; \
	ldd	[%src + (offset) + 0x18], %t6; \
	std	%t0, [%dst + (offset) + 0x00]; \
	std	%t2, [%dst + (offset) + 0x08]; \
	std	%t4, [%dst + (offset) + 0x10]; \
	std	%t6, [%dst + (offset) + 0x18];

#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - (offset) - 0x10], %t0; \
	ldd	[%src - (offset) - 0x08], %t2; \
	st	%t0, [%dst - (offset) - 0x10]; \
	st	%t1, [%dst - (offset) - 0x0c]; \
	st	%t2, [%dst - (offset) - 0x08]; \
	st	%t3, [%dst - (offset) - 0x04];

#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - (offset) - 0x10], %t0; \
	ldd	[%src - (offset) - 0x08], %t2; \
	std	%t0, [%dst - (offset) - 0x10]; \
	std	%t2, [%dst - (offset) - 0x08];

#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub	[%src - (offset) - 0x02], %t0; \
	ldub	[%src - (offset) - 0x01], %t1; \
	stb	%t0, [%dst - (offset) - 0x02]; \
	stb	%t1, [%dst - (offset) - 0x01];

	.text
	.align	4

FUNC(memmove)
EXPORT_SYMBOL(memmove)
	cmp	%o0, %o1
	mov	%o0, %g7
	bleu	9f
	 sub	%o0, %o1, %o4

	add	%o1, %o2, %o3
	cmp	%o3, %o0
	bleu	0f
	 andcc	%o4, 3, %o5

	add	%o1, %o2, %o1
	add	%o0, %o2, %o0
	sub	%o1, 1, %o1
	sub	%o0, 1, %o0

1:	/* reverse_bytes */

	ldub	[%o1], %o4
	subcc	%o2, 1, %o2
	stb	%o4, [%o0]
	sub	%o1, 1, %o1
	bne	1b
	 sub	%o0, 1, %o0

	retl
	 mov	%g7, %o0
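The memmove entry point above only has to decide a copy direction: when the destination starts inside the source range it walks backwards one byte at a time, otherwise it falls through into the forward memcpy path. A minimal C sketch of that overlap test (illustrative only, not the kernel routine; the function name is made up):

#include <stddef.h>

/* Illustrative C model of the overlap test used by memmove above:
 * copy backwards only when dst overlaps the tail of src. */
static void *memmove_sketch(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (d <= s || d >= s + n) {
		/* no harmful overlap: plain forward copy (memcpy path) */
		while (n--)
			*d++ = *s++;
	} else {
		/* dst overlaps the end of src: copy from the last byte down */
		d += n;
		s += n;
		while (n--)
			*--d = *--s;
	}
	return dst;
}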
/* NOTE: This code is executed just for the cases,
         where %src (=%o1) & 3 is != 0.
	 We need to align it to 4. So, for (%src & 3)
	 1 we need to do ldub,lduh
	 2 lduh
	 3 just ldub
         so even if it looks weird, the branches
         are correct here. -jj
 */
78:	/* dword_align */

	andcc	%o1, 1, %g0
	be	4f
	 andcc	%o1, 2, %g0

	ldub	[%o1], %g2
	add	%o1, 1, %o1
	stb	%g2, [%o0]
	sub	%o2, 1, %o2
	bne	3f
	 add	%o0, 1, %o0
4:
	lduh	[%o1], %g2
	add	%o1, 2, %o1
	sth	%g2, [%o0]
	sub	%o2, 2, %o2
	b	3f
	 add	%o0, 2, %o0

FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */
EXPORT_SYMBOL(memcpy)

	sub	%o0, %o1, %o4
	mov	%o0, %g7
9:
	andcc	%o4, 3, %o5
0:
	bne	86f
	 cmp	%o2, 15

	bleu	90f
	 andcc	%o1, 3, %g0

	bne	78b
3:
	 andcc	%o1, 4, %g0

	be	2f
	 mov	%o2, %g1

	ld	[%o1], %o4
	sub	%g1, 4, %g1
	st	%o4, [%o0]
	add	%o1, 4, %o1
	add	%o0, 4, %o0
2:
	andcc	%g1, 0xffffff80, %g0
	be	3f
	 andcc	%o0, 4, %g0

	be	82f + 4
5:
	MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	sub	%g1, 128, %g1
	add	%o1, 128, %o1
	cmp	%g1, 128
	bge	5b
	 add	%o0, 128, %o0
3:
	andcc	%g1, 0x70, %g4
	be	80f
	 andcc	%g1, 8, %g0

	sethi	%hi(80f), %o5
	srl	%g4, 1, %o4
	add	%g4, %o4, %o4
	add	%o1, %g4, %o1
	sub	%o5, %o4, %o5
	jmpl	%o5 + %lo(80f), %g0
	 add	%o0, %g4, %o0

79:	/* memcpy_table */

	MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

80:	/* memcpy_table_end */
	be	81f
	 andcc	%g1, 4, %g0

	ldd	[%o1], %g2
	add	%o0, 8, %o0
	st	%g2, [%o0 - 0x08]
	add	%o1, 8, %o1
	st	%g3, [%o0 - 0x04]

81:	/* memcpy_last7 */

	be	1f
	 andcc	%g1, 2, %g0

	ld	[%o1], %g2
	add	%o1, 4, %o1
	st	%g2, [%o0]
	add	%o0, 4, %o0
1:
	be	1f
	 andcc	%g1, 1, %g0

	lduh	[%o1], %g2
	add	%o1, 2, %o1
	sth	%g2, [%o0]
	add	%o0, 2, %o0
1:
	be	1f
	 nop

	ldub	[%o1], %g2
	stb	%g2, [%o0]
1:
	retl
	 mov	%g7, %o0
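The aligned path above moves 128 bytes per iteration with four MOVE_BIGCHUNK expansions, then jumps to a computed offset inside the MOVE_LASTCHUNK table so that only the remaining 16-byte groups execute, and finally mops up at most 7 bytes. A rough C rendering of that staged structure (word-at-a-time, hypothetical name, plain loops standing in for the computed jump):

#include <stddef.h>
#include <stdint.h>

/* Rough C model of the aligned copy path above: big unrolled chunks,
 * then progressively smaller tails.  Purely illustrative. */
static void copy_aligned_sketch(uint32_t *dst, const uint32_t *src, size_t len)
{
	while (len >= 128) {			/* MOVE_BIGCHUNK x4 */
		for (int i = 0; i < 32; i++)
			dst[i] = src[i];
		dst += 32;
		src += 32;
		len -= 128;
	}
	while (len >= 16) {			/* MOVE_LASTCHUNK table entry */
		for (int i = 0; i < 4; i++)
			dst[i] = src[i];
		dst += 4;
		src += 4;
		len -= 16;
	}
	/* memcpy_last7: at most 7 bytes remain; the asm uses word/half/byte
	 * moves here, a byte loop is the simple equivalent */
	unsigned char *d = (unsigned char *)dst;
	const unsigned char *s = (const unsigned char *)src;
	while (len--)
		*d++ = *s++;
}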
82:	/* ldd_std */
	MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	subcc	%g1, 128, %g1
	add	%o1, 128, %o1
	cmp	%g1, 128
	bge	82b
	 add	%o0, 128, %o0

	andcc	%g1, 0x70, %g4
	be	84f
	 andcc	%g1, 8, %g0

	sethi	%hi(84f), %o5
	add	%o1, %g4, %o1
	sub	%o5, %g4, %o5
	jmpl	%o5 + %lo(84f), %g0
	 add	%o0, %g4, %o0

83:	/* amemcpy_table */

	MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

84:	/* amemcpy_table_end */
	be	85f
	 andcc	%g1, 4, %g0

	ldd	[%o1], %g2
	add	%o0, 8, %o0
	std	%g2, [%o0 - 0x08]
	add	%o1, 8, %o1
85:	/* amemcpy_last7 */
	be	1f
	 andcc	%g1, 2, %g0

	ld	[%o1], %g2
	add	%o1, 4, %o1
	st	%g2, [%o0]
	add	%o0, 4, %o0
1:
	be	1f
	 andcc	%g1, 1, %g0

	lduh	[%o1], %g2
	add	%o1, 2, %o1
	sth	%g2, [%o0]
	add	%o0, 2, %o0
1:
	be	1f
	 nop

	ldub	[%o1], %g2
	stb	%g2, [%o0]
1:
	retl
	 mov	%g7, %o0
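The 82: ldd_std variant is selected when the destination is doubleword aligned, so the same 128-byte chunks can be stored with 64-bit std instructions to match the ldd loads. In C the equivalent idea is simply to copy in 64-bit units once alignment allows; a sketch only (caller must guarantee 8-byte alignment of both pointers):

#include <stddef.h>
#include <stdint.h>

/* Sketch of the ldd/std idea: when both pointers are 8-byte aligned,
 * move the bulk of the data in 64-bit units, then finish with bytes. */
static void copy_doubleword_sketch(void *dst, const void *src, size_t len)
{
	uint64_t *d = dst;			/* assumed 8-byte aligned */
	const uint64_t *s = src;		/* assumed 8-byte aligned */

	while (len >= 8) {
		*d++ = *s++;
		len -= 8;
	}
	unsigned char *db = (unsigned char *)d;
	const unsigned char *sb = (const unsigned char *)s;
	while (len--)				/* tail (< 8 bytes) */
		*db++ = *sb++;
}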
86:	/* non_aligned */
	cmp	%o2, 6
	bleu	88f
	 nop

	save	%sp, -96, %sp
	andcc	%i0, 3, %g0
	be	61f
	 andcc	%i0, 1, %g0
	be	60f
	 andcc	%i0, 2, %g0

	ldub	[%i1], %g5
	add	%i1, 1, %i1
	stb	%g5, [%i0]
	sub	%i2, 1, %i2
	bne	61f
	 add	%i0, 1, %i0
60:
	ldub	[%i1], %g3
	add	%i1, 2, %i1
	stb	%g3, [%i0]
	sub	%i2, 2, %i2
	ldub	[%i1 - 1], %g3
	add	%i0, 2, %i0
	stb	%g3, [%i0 - 1]
61:
	and	%i1, 3, %g2
	and	%i2, 0xc, %g3
	and	%i1, -4, %i1
	cmp	%g3, 4
	sll	%g2, 3, %g4
	mov	32, %g2
	be	4f
	 sub	%g2, %g4, %l0

	blu	3f
	 cmp	%g3, 0x8

	be	2f
	 srl	%i2, 2, %g3

	ld	[%i1], %i3
	add	%i0, -8, %i0
	ld	[%i1 + 4], %i4
	b	8f
	 add	%g3, 1, %g3
2:
	ld	[%i1], %i4
	add	%i0, -12, %i0
	ld	[%i1 + 4], %i5
	add	%g3, 2, %g3
	b	9f
	 add	%i1, -4, %i1
3:
	ld	[%i1], %g1
	add	%i0, -4, %i0
	ld	[%i1 + 4], %i3
	srl	%i2, 2, %g3
	b	7f
	 add	%i1, 4, %i1
4:
	ld	[%i1], %i5
	cmp	%i2, 7
	ld	[%i1 + 4], %g1
	srl	%i2, 2, %g3
	bleu	10f
	 add	%i1, 8, %i1

	ld	[%i1], %i3
	add	%g3, -1, %g3
5:
	sll	%i5, %g4, %g2
	srl	%g1, %l0, %g5
	or	%g2, %g5, %g2
	st	%g2, [%i0]
7:
	ld	[%i1 + 4], %i4
	sll	%g1, %g4, %g2
	srl	%i3, %l0, %g5
	or	%g2, %g5, %g2
	st	%g2, [%i0 + 4]
8:
	ld	[%i1 + 8], %i5
	sll	%i3, %g4, %g2
	srl	%i4, %l0, %g5
	or	%g2, %g5, %g2
	st	%g2, [%i0 + 8]
9:
	ld	[%i1 + 12], %g1
	sll	%i4, %g4, %g2
	srl	%i5, %l0, %g5
	addcc	%g3, -4, %g3
	or	%g2, %g5, %g2
	add	%i1, 16, %i1
	st	%g2, [%i0 + 12]
	add	%i0, 16, %i0
	bne,a	5b
	 ld	[%i1], %i3
10:
	sll	%i5, %g4, %g2
	srl	%g1, %l0, %g5
	srl	%l0, 3, %g3
	or	%g2, %g5, %g2
	sub	%i1, %g3, %i1
	andcc	%i2, 2, %g0
	st	%g2, [%i0]
	be	1f
	 andcc	%i2, 1, %g0

	ldub	[%i1], %g2
	add	%i1, 2, %i1
	stb	%g2, [%i0 + 4]
	add	%i0, 2, %i0
	ldub	[%i1 - 1], %g2
	stb	%g2, [%i0 + 3]
1:
	be	1f
	 nop
	ldub	[%i1], %g2
	stb	%g2, [%i0 + 4]
1:
	ret
	 restore %g7, %g0, %o0

88:	/* short_end */

	and	%o2, 0xe, %o3
20:
	sethi	%hi(89f), %o5
	sll	%o3, 3, %o4
	add	%o0, %o3, %o0
	sub	%o5, %o4, %o5
	add	%o1, %o3, %o1
	jmpl	%o5 + %lo(89f), %g0
	 andcc	%o2, 1, %g0

	MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

89:	/* short_table_end */

	be	1f
	 nop

	ldub	[%o1], %g2
	stb	%g2, [%o0]
1:
	retl
	 mov	%g7, %o0

90:	/* short_aligned_end */
	bne	88b
	 andcc	%o2, 8, %g0

	be	1f
	 andcc	%o2, 4, %g0

	ld	[%o1 + 0x00], %g2
	ld	[%o1 + 0x04], %g3
	add	%o1, 8, %o1
	st	%g2, [%o0 + 0x00]
	st	%g3, [%o0 + 0x04]
	add	%o0, 8, %o0
1:
	b	81b
	 mov	%o2, %g1
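The 86: non_aligned path above reads whole aligned words from the source and assembles each destination word from two neighbouring source words with sll/srl/or, using the shift counts kept in %g4 and %l0. A small C model of that shift-and-or technique, assuming a big-endian layout to mirror the SPARC pairing (the function name is illustrative, and the aligned-source case is handled elsewhere):

#include <stddef.h>
#include <stdint.h>

/* Illustrative model of the shift-and-or copy above: dst is word aligned,
 * src sits 'skew' bytes (1..3) inside an aligned word.  Each output word
 * is built from two adjacent aligned input words. */
static void copy_skewed_sketch(uint32_t *dst, const unsigned char *src,
			       size_t words)
{
	size_t skew = (uintptr_t)src & 3;		/* assumed 1, 2 or 3 */
	const uint32_t *s = (const uint32_t *)(src - skew);
	unsigned int lshift = 8 * skew;			/* %g4 in the asm */
	unsigned int rshift = 32 - lshift;		/* %l0 in the asm */
	uint32_t prev = *s++;

	while (words--) {
		uint32_t next = *s++;
		*dst++ = (prev << lshift) | (next >> rshift);
		prev = next;
	}
}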
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 * Copyright (C) 2007  Maciej W. Rozycki
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory on some systems.  It's also a seriously bad idea on non
 * dma-coherent systems.
 */
#ifdef CONFIG_DMA_NONCOHERENT
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
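The Spec and exception-handler comments above pin down the __copy_user contract: copy as much as possible, report the number of bytes left uncopied in len, and on a faulting load make sure the untouched tail of the destination is overwritten. A C-level model of that contract (not the kernel API itself; the fault is simulated by an explicit address parameter):

#include <stddef.h>

/* Conceptual model of the __copy_user contract described above:
 * copy until a (simulated) faulting load, zero the rest of dst, and
 * return the number of bytes NOT copied. */
static size_t copy_user_sketch(unsigned char *dst, const unsigned char *src,
			       size_t len, const unsigned char *fault_at)
{
	size_t done = 0;

	while (done < len) {
		if (src + done == fault_at)	/* load would fault here */
			break;
		dst[done] = src[done];
		done++;
	}
	/* on a faulting load the handler also clears the untouched tail of
	 * dst so stale kernel data cannot leak to user space */
	for (size_t i = done; i < len; i++)
		dst[i] = 0;
	return len - done;			/* uncopied byte count */
}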
/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)

	.text
	.set	noreorder
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif
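LDFIRST/LDREST map to the lwl/lwr (or ldl/ldr) pair, which together fetch one unaligned word using at most two aligned accesses; the endian-dependent defines above only choose which member of the pair handles the low-order bytes. A portable C sketch of the effect the pair achieves (byte composition stands in for the two partial word accesses; the name is illustrative):

#include <stddef.h>
#include <stdint.h>

/*
 * Portable model of what one LDFIRST/LDREST pair achieves: fetch a word
 * from an address with no alignment guarantee.  lwl/lwr do it with two
 * aligned accesses and no alignment trap; plain C goes byte by byte.
 */
static uint32_t load_unaligned_sketch(const unsigned char *p)
{
	uint32_t w = 0;

	for (size_t i = 0; i < sizeof(w); i++)
		w |= (uint32_t)p[i] << (8 * i);	/* little-endian composition */
	return w;
}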
/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
.L__memcpy:
FEXPORT(__copy_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	R10KCBARRIER(0(ra))
	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREF(	0, 0(src) )
	PREF(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREF(	0, 1*32(src) )
	PREF(	1, 1*32(dst) )
	bnez	t2, .Lcopy_bytes_checklen
	 and	t0, src, ADDRMASK
	PREF(	0, 2*32(src) )
	PREF(	1, 2*32(dst) )
	bnez	t1, .Ldst_unaligned
	 nop
	bnez	t0, .Lsrc_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
.Lboth_aligned:
	 SRL	t0, len, LOG_NBYTES+3		# +3 for 8 units/iter
	beqz	t0, .Lcleanup_both_aligned	# len < 8*NBYTES
	 and	rem, len, (8*NBYTES-1)		# rem = len % (8*NBYTES)
	PREF(	0, 3*32(src) )
	PREF(	1, 3*32(dst) )
	.align	4
1:
	R10KCBARRIER(0(ra))
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	LOAD	t4, UNIT(4)(src),	.Ll_exc_copy)
EXC(	LOAD	t7, UNIT(5)(src),	.Ll_exc_copy)
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc_p7u)
EXC(	LOAD	t0, UNIT(6)(src),	.Ll_exc_copy)
EXC(	LOAD	t1, UNIT(7)(src),	.Ll_exc_copy)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
EXC(	STORE	t2, UNIT(-6)(dst),	.Ls_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	.Ls_exc_p5u)
EXC(	STORE	t4, UNIT(-4)(dst),	.Ls_exc_p4u)
EXC(	STORE	t7, UNIT(-3)(dst),	.Ls_exc_p3u)
EXC(	STORE	t0, UNIT(-2)(dst),	.Ls_exc_p2u)
EXC(	STORE	t1, UNIT(-1)(dst),	.Ls_exc_p1u)
	PREF(	0, 8*32(src) )
	PREF(	1, 8*32(dst) )
	bne	len, rem, 1b
	 nop

	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
.Lcleanup_both_aligned:
	beqz	len, .Ldone
	 sltu	t0, len, 4*NBYTES
	bnez	t0, .Lless_than_4units
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	.Ll_exc)
EXC(	LOAD	t1, UNIT(1)(src),	.Ll_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	.Ll_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
	R10KCBARRIER(0(ra))
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, .Ldone
	.set	noreorder
.Lless_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, .Lcopy_bytes
	 nop
1:
	R10KCBARRIER(0(ra))
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		.Ls_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
#define bits t2
	beqz	len, .Ldone
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		.Ll_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
EXC(	STREST	t0, -1(t1),		.Ls_exc)
	jr	ra
	 move	len, zero
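The .Lboth_aligned loop above moves eight NBYTES units per pass and issues several loads before the first store so that cache-miss latency overlaps with useful work. A simplified C rendering of the same unroll-by-8 shape (the word type and function name are placeholders; instruction scheduling is left to the compiler here):

#include <stddef.h>
#include <stdint.h>

typedef uint32_t word_t;	/* stands in for the NBYTES unit (uint64_t on 64-bit) */

/* Simplified model of the unrolled aligned loop above: 8 units per pass,
 * the remainder is left for the caller (.Lcleanup_both_aligned in the asm). */
static size_t copy_aligned_by8_sketch(word_t *dst, const word_t *src, size_t len)
{
	size_t passes = len / (8 * sizeof(word_t));

	while (passes--) {
		word_t t0 = src[0], t1 = src[1], t2 = src[2], t3 = src[3];
		word_t t4 = src[4], t5 = src[5], t6 = src[6], t7 = src[7];

		dst[0] = t0; dst[1] = t1; dst[2] = t2; dst[3] = t3;
		dst[4] = t4; dst[5] = t5; dst[6] = t6; dst[7] = t7;
		src += 8;
		dst += 8;
	}
	return len % (8 * sizeof(word_t));	/* bytes still to copy */
}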
.Ldst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; T1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	.Ll_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	.Ll_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
	R10KCBARRIER(0(ra))
EXC(	STFIRST t3, FIRST(0)(dst),	.Ls_exc)
	beq	len, t2, .Ldone
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, .Lboth_aligned
	 ADD	src, src, t2

.Lsrc_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
	PREF(	0, 3*32(src) )
	beqz	t0, .Lcleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
	PREF(	1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
	R10KCBARRIER(0(ra))
EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	.Ll_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	.Ll_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	.Ll_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	.Ll_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	.Ll_exc_copy)
	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
EXC(	STORE	t0, UNIT(0)(dst),	.Ls_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	.Ls_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	.Ls_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	.Ls_exc_p1u)
	PREF(	1, 9*32(dst) )		# 1 is PREF_STORE (not streamed)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcleanup_src_unaligned:
	beqz	len, .Ldone
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, .Lcopy_bytes
	 nop
1:
	R10KCBARRIER(0(ra))
EXC(	LDFIRST	t0, FIRST(0)(src),	.Ll_exc)
EXC(	LDREST	t0, REST(0)(src),	.Ll_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		.Ls_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

.Lcopy_bytes_checklen:
	beqz	len, .Ldone
	 nop
.Lcopy_bytes:
	/* 0 < len < NBYTES  */
	R10KCBARRIER(0(ra))
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), .Ll_exc);	\
	SUB	len, len, 1;		\
	beqz	len, .Ldone;		\
EXC(	 sb	t0, N(dst), .Ls_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), .Ll_exc)
	SUB	len, len, 1
	jr	ra
EXC(	 sb	t0, NBYTES-2(dst), .Ls_exc_p1)
.Ldone:
	jr	ra
	 nop
	END(memcpy)
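Before END(memcpy), .Ldst_unaligned first copies just enough bytes to put dst on an NBYTES boundary; the xor into match then decides whether src now shares that alignment, rejoining .Lboth_aligned when it does and falling into the shifted-load loop when it does not. A condensed C model of that dispatch (all names are stand-ins, and the fast paths are reduced to trivial loops):

#include <stddef.h>
#include <stdint.h>

#define WORD_MASK (sizeof(uintptr_t) - 1)	/* plays the role of ADDRMASK */

/* stand-in for .Lcopy_bytes: byte-at-a-time fallback */
static void copy_bytes_sketch(unsigned char *dst, const unsigned char *src,
			      size_t len)
{
	while (len--)
		*dst++ = *src++;
}

/* stand-in for .Lboth_aligned: both pointers are word aligned here */
static void copy_both_aligned_sketch(unsigned char *dst,
				     const unsigned char *src, size_t len)
{
	while (len >= sizeof(uintptr_t)) {
		*(uintptr_t *)dst = *(const uintptr_t *)src;
		dst += sizeof(uintptr_t);
		src += sizeof(uintptr_t);
		len -= sizeof(uintptr_t);
	}
	copy_bytes_sketch(dst, src, len);
}

/* Condensed model of the alignment dispatch done by __copy_user above. */
static void memcpy_dispatch_sketch(unsigned char *dst, const unsigned char *src,
				   size_t len)
{
	if (len < sizeof(uintptr_t)) {			/* .Lcopy_bytes_checklen */
		copy_bytes_sketch(dst, src, len);
		return;
	}
	if ((uintptr_t)dst & WORD_MASK) {		/* .Ldst_unaligned */
		size_t fix = sizeof(uintptr_t) - ((uintptr_t)dst & WORD_MASK);

		copy_bytes_sketch(dst, src, fix);	/* prime dst alignment */
		dst += fix;
		src += fix;
		len -= fix;
	}
	if (((uintptr_t)dst ^ (uintptr_t)src) & WORD_MASK)	/* 'match' test */
		copy_bytes_sketch(dst, src, len);	/* real code: LDFIRST/LDREST loop */
	else
		copy_both_aligned_sketch(dst, src, len); /* real code: .Lboth_aligned */
}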
.Ll_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	.Ll_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 1
	bne	src, t0, 1b
	.set	noreorder
.Ll_exc:
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in a1
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	.set	reorder				/* DADDI_WAR */
	SUB	src, len, 1
	beqz	len, .Ldone
	.set	noreorder
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	bnez	src, 1b
	 SUB	src, src, 1
#else
	.set	push
	.set	noat
	li	v1, 1
	bnez	src, 1b
	 SUB	src, src, v1
	.set	pop
#endif
	jr	ra
	 nop


#define SEXC(n)							\
	.set	reorder;			/* DADDI_WAR */	\
.Ls_exc_p ## n ## u:						\
	ADD	len, len, n*NBYTES;				\
	jr	ra;						\
	.set	noreorder

SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

.Ls_exc_p1:
	.set	reorder				/* DADDI_WAR */
	ADD	len, len, 1
	jr	ra
	.set	noreorder
.Ls_exc:
	jr	ra
	 nop

	.align	5
LEAF(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, .L__memcpy
	 move	v0, a0				/* return value */
	beqz	a2, .Lr_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	sltu	t0, a1, a0
	beqz	t0, .Lr_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

.Lr_end_bytes:
	R10KCBARRIER(0(ra))
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	SUB	a0, a0, 0x1
	bnez	a2, .Lr_end_bytes
	.set	noreorder

.Lr_out:
	jr	ra
	 move	a2, zero

.Lr_end_bytes_up:
	R10KCBARRIER(0(ra))
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	ADD	a0, a0, 0x1
	bnez	a2, .Lr_end_bytes_up
	.set	noreorder

	jr	ra
	 move	a2, zero
	END(__rmemcpy)
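The .Ll_exc fixup above recovers from a faulting load by computing how many source bytes could not be read (AT minus the fault address recorded in THREAD_BUADDR) and then zeroing the corresponding tail of the destination, exactly as the exception-handler comments require. A C model of that arithmetic (parameter names are illustrative, not kernel symbols):

#include <stddef.h>

/* Model of the .Ll_exc fixup: 'end_of_src' plays the role of AT (one past
 * the last source byte) and 'fault' the role of THREAD_BUADDR, the first
 * source address that could not be read. */
static size_t load_fault_fixup_sketch(unsigned char *dst,
				      const unsigned char *src,
				      const unsigned char *end_of_src,
				      const unsigned char *fault)
{
	size_t uncopied = (size_t)(end_of_src - fault);	/* SUB len, AT, t0 */
	unsigned char *clear = dst + (fault - src);	/* first byte to clear */

	/* the handler may not call __bzero, so a plain loop is used */
	for (size_t i = 0; i < uncopied; i++)
		clear[i] = 0;
	return uncopied;		/* becomes the new len/a2 return value */
}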