/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * User Space Access Routines
 *
 * Copyright (C) 2000-2002 Hewlett-Packard (John Marvin)
 * Copyright (C) 2000 Richard Hirst <rhirst with parisc-linux.org>
 * Copyright (C) 2001 Matthieu Delahaye <delahaym at esiee.fr>
 * Copyright (C) 2003 Randolph Chung <tausq with parisc-linux.org>
 * Copyright (C) 2017 Helge Deller <deller@gmx.de>
 * Copyright (C) 2017 John David Anglin <dave.anglin@bell.net>
 */

/*
 * These routines still have plenty of room for optimization
 * (word & doubleword load/store, dual issue, store hints, etc.).
 */

/*
 * The following routines assume that space register 3 (sr3) contains
 * the space id associated with the current user's address space.
 */


	.text

#include <asm/assembly.h>
#include <asm/errno.h>
#include <linux/linkage.h>

	/*
	 * unsigned long lclear_user(void *to, unsigned long n)
	 *
	 * Returns 0 for success; otherwise, returns the number of bytes
	 * not transferred.
	 */

ENTRY_CFI(lclear_user)
	comib,=,n	0,%r25,$lclu_done
$lclu_loop:
	addib,<>	-1,%r25,$lclu_loop
1:	stbs,ma		%r0,1(%sr3,%r26)

$lclu_done:
	bv		%r0(%r2)
	copy		%r25,%r28

2:	b		$lclu_done
	ldo		1(%r25),%r25

	ASM_EXCEPTIONTABLE_ENTRY(1b,2b)
ENDPROC_CFI(lclear_user)


/*
 * unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
 *
 * Inputs:
 * - sr1 already contains space of source region
 * - sr2 already contains space of destination region
 *
 * Returns:
 * - number of bytes that could not be copied.
 *   On success, this will be zero.
 *
 * This code is based on a C implementation of a copy routine written by
 * Randolph Chung, which in turn was derived from glibc.
 *
 * Several strategies are tried to get the best performance for various
 * conditions. In the optimal case, we copy in loops that move 32 or 16 bytes
 * at a time using general registers. Unaligned copies are handled either by
 * aligning the destination and then using a shift-and-write method, or in a
 * few cases by falling back to a byte-at-a-time copy.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of
 * memcpy (written in C) is actually quite fast already. This routine is able
 * to beat it by 30-40% for aligned copies because of the loop unrolling, but
 * in some cases the glibc version is still slightly faster. This lends more
 * credibility to the idea that gcc can generate very good code as long as we
 * are careful.
 *
 * Possible optimizations:
 * - add cache prefetching
 * - try not to use the post-increment address modifiers; they may create
 *   additional interlocks. The assumption is that those were only efficient
 *   on old machines (pre-PA8000 processors).
 */

dst = arg0
src = arg1
len = arg2
end = arg3
t1 = r19
t2 = r20
t3 = r21
t4 = r22
srcspc = sr1
dstspc = sr2

t0 = r1
a1 = t1
a2 = t2
a3 = t3
a0 = t4

save_src = ret0
save_dst = ret1
save_len = r31

ENTRY_CFI(pa_memcpy)
	/* Last destination address */
	add	dst,len,end

	/* short copy with less than 16 bytes? */
	cmpib,COND(>>=),n 15,len,.Lbyte_loop

	/* same alignment? */
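	/* The low bits of src^dst show whether src and dst can be brought
	 * to a common word (or, with CONFIG_64BIT, doubleword) boundary by
	 * the align loops below; if they differ, the shift-based unaligned
	 * path further down is used instead. */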
	xor	src,dst,t0
	extru	t0,31,2,t1
	cmpib,<>,n  0,t1,.Lunaligned_copy

#ifdef CONFIG_64BIT
	/* only do 64-bit copies if we can get aligned. */
	extru	t0,31,3,t1
	cmpib,<>,n  0,t1,.Lalign_loop32

	/* loop until we are 64-bit aligned */
.Lalign_loop64:
	extru	dst,31,3,t1
	cmpib,=,n	0,t1,.Lcopy_loop_16_start
20:	ldb,ma	1(srcspc,src),t1
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lalign_loop64
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_loop_16_start:
	ldi	31,t0
.Lcopy_loop_16:
	cmpb,COND(>>=),n t0,len,.Lword_loop

10:	ldd	0(srcspc,src),t1
11:	ldd	8(srcspc,src),t2
	ldo	16(src),src
12:	std,ma	t1,8(dstspc,dst)
13:	std,ma	t2,8(dstspc,dst)
14:	ldd	0(srcspc,src),t1
15:	ldd	8(srcspc,src),t2
	ldo	16(src),src
16:	std,ma	t1,8(dstspc,dst)
17:	std,ma	t2,8(dstspc,dst)

	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy16_fault)
	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy16_fault)
	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)

	b	.Lcopy_loop_16
	ldo	-32(len),len

.Lword_loop:
	cmpib,COND(>>=),n 3,len,.Lbyte_loop
20:	ldw,ma	4(srcspc,src),t1
21:	stw,ma	t1,4(dstspc,dst)
	b	.Lword_loop
	ldo	-4(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

#endif /* CONFIG_64BIT */

	/* loop until we are 32-bit aligned */
.Lalign_loop32:
	extru	dst,31,2,t1
	cmpib,=,n	0,t1,.Lcopy_loop_8
20:	ldb,ma	1(srcspc,src),t1
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lalign_loop32
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)


.Lcopy_loop_8:
	cmpib,COND(>>=),n 15,len,.Lbyte_loop

10:	ldw	0(srcspc,src),t1
11:	ldw	4(srcspc,src),t2
12:	stw,ma	t1,4(dstspc,dst)
13:	stw,ma	t2,4(dstspc,dst)
14:	ldw	8(srcspc,src),t1
15:	ldw	12(srcspc,src),t2
	ldo	16(src),src
16:	stw,ma	t1,4(dstspc,dst)
17:	stw,ma	t2,4(dstspc,dst)

	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy8_fault)
	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy8_fault)
	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)

	b	.Lcopy_loop_8
	ldo	-16(len),len

.Lbyte_loop:
	cmpclr,COND(<>) len,%r0,%r0
	b,n	.Lcopy_done
20:	ldb	0(srcspc,src),t1
	ldo	1(src),src
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lbyte_loop
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_done:
	bv	%r0(%r2)
	sub	end,dst,ret0


	/* src and dst are not aligned the same way. */
	/* need to go the hard way */
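/*
 * The code below implements, unrolled four words per iteration and with
 * fault fixups, the shift-and-merge idea mentioned in the header comment.
 * As an illustrative sketch only -- shift_merge_copy is a hypothetical
 * helper, not something the kernel provides -- and assuming a big-endian
 * CPU (as PA-RISC is), 32-bit words, a word-aligned dst and a src that is
 * not word aligned (guaranteed here, because src and dst differ in their
 * low two bits), the core loop is roughly:
 *
 *	typedef unsigned int u32;	// 32 bits on parisc
 *
 *	static void shift_merge_copy(u32 *dst, const char *src,
 *				     unsigned long words)
 *	{
 *		unsigned int sh1 = ((unsigned long)src & 3) * 8;
 *		unsigned int sh2 = 32 - sh1;
 *		// round src down to a word boundary, like depi 0,31,2,src
 *		const u32 *s = (const u32 *)((unsigned long)src & ~3UL);
 *		u32 w0 = *s++;
 *
 *		while (words--) {
 *			u32 w1 = *s++;
 *			// one shrpw: tail bytes of w0, head bytes of w1
 *			*dst++ = (w0 << sh1) | (w1 >> sh2);
 *			w0 = w1;
 *		}
 *	}
 *
 * Like the assembly, this reads the whole source words covering the range.
 * The real loop keeps four source words (a0-a3) in flight so each store
 * needs only one shrpw; faulting loads branch to .Lcda_rdfault, which
 * falls back to the byte loop, and faulting stores to .Lcopy_done, which
 * reports the remaining length.
 */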
.Lunaligned_copy:
	/* align until dst is 32-bit word aligned */
	extru	dst,31,2,t1
	cmpib,=,n	0,t1,.Lcopy_dstaligned
20:	ldb	0(srcspc,src),t1
	ldo	1(src),src
21:	stb,ma	t1,1(dstspc,dst)
	b	.Lunaligned_copy
	ldo	-1(len),len

	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)

.Lcopy_dstaligned:

	/* store src, dst and len in a safe place */
	copy	src,save_src
	copy	dst,save_dst
	copy	len,save_len

	/* len now needs to give the number of words to copy */
	SHRREG	len,2,len

	/*
	 * Copy from a not-aligned src to an aligned dst using shifts.
	 * Handles 4 words per loop.
	 */

	depw,z src,28,2,t0
	subi 32,t0,t0
	mtsar t0
	extru len,31,2,t0
	cmpib,= 2,t0,.Lcase2
	/* Make src aligned by rounding it down. */
	depi 0,31,2,src

	cmpiclr,<> 3,t0,%r0
	b,n .Lcase3
	cmpiclr,<> 1,t0,%r0
	b,n .Lcase1
.Lcase0:
	cmpb,COND(=) %r0,len,.Lcda_finish
	nop

1:	ldw,ma 4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma 4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b,n .Ldo3
.Lcase1:
1:	ldw,ma 4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma 4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	ldo -1(len),len
	cmpb,COND(=),n %r0,len,.Ldo0
.Ldo4:
1:	ldw,ma 4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw a2, a3, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo3:
1:	ldw,ma 4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw a3, a0, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo2:
1:	ldw,ma 4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw a0, a1, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
.Ldo1:
1:	ldw,ma 4(srcspc,src), a3
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	shrpw a1, a2, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
	ldo -4(len),len
	cmpb,COND(<>) %r0,len,.Ldo4
	nop
.Ldo0:
	shrpw a2, a3, %sar, t0
1:	stw,ma t0, 4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)

.Lcda_rdfault:
.Lcda_finish:
	/* calculate new src, dst and len and jump to byte-copy loop */
	sub	dst,save_dst,t0
	add	save_src,t0,src
	b	.Lbyte_loop
	sub	save_len,t0,len

.Lcase3:
1:	ldw,ma 4(srcspc,src), a0
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma 4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b .Ldo2
	ldo 1(len),len
.Lcase2:
1:	ldw,ma 4(srcspc,src), a1
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
1:	ldw,ma 4(srcspc,src), a2
	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	b .Ldo1
	ldo 2(len),len


	/* fault exception fixup handlers: */
#ifdef CONFIG_64BIT
.Lcopy16_fault:
	b	.Lcopy_done
10:	std,ma	t1,8(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
#endif

.Lcopy8_fault:
	b	.Lcopy_done
10:	stw,ma	t1,4(dstspc,dst)
	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
ENDPROC_CFI(pa_memcpy)

	.end