/* SPDX-License-Identifier: GPL-2.0 */
/* memcpy.S: Sparc optimized memcpy and memmove code
 * Hand optimized from GNU libc's memcpy and memmove
 * Copyright (C) 1991,1996 Free Software Foundation
 * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 */

#include <linux/export.h>

#define FUNC(x)			\
	.globl	x;		\
	.type	x,@function;	\
	.align	4;		\
x:

/* Both these macros have to start with exactly the same insn */
#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	ldd	[%src + (offset) + 0x10], %t4; \
	ldd	[%src + (offset) + 0x18], %t6; \
	st	%t0, [%dst + (offset) + 0x00]; \
	st	%t1, [%dst + (offset) + 0x04]; \
	st	%t2, [%dst + (offset) + 0x08]; \
	st	%t3, [%dst + (offset) + 0x0c]; \
	st	%t4, [%dst + (offset) + 0x10]; \
	st	%t5, [%dst + (offset) + 0x14]; \
	st	%t6, [%dst + (offset) + 0x18]; \
	st	%t7, [%dst + (offset) + 0x1c];

#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	ldd	[%src + (offset) + 0x10], %t4; \
	ldd	[%src + (offset) + 0x18], %t6; \
	std	%t0, [%dst + (offset) + 0x00]; \
	std	%t2, [%dst + (offset) + 0x08]; \
	std	%t4, [%dst + (offset) + 0x10]; \
	std	%t6, [%dst + (offset) + 0x18];

#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - (offset) - 0x10], %t0; \
	ldd	[%src - (offset) - 0x08], %t2; \
	st	%t0, [%dst - (offset) - 0x10]; \
	st	%t1, [%dst - (offset) - 0x0c]; \
	st	%t2, [%dst - (offset) - 0x08]; \
	st	%t3, [%dst - (offset) - 0x04];

#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - (offset) - 0x10], %t0; \
	ldd	[%src - (offset) - 0x08], %t2; \
	std	%t0, [%dst - (offset) - 0x10]; \
	std	%t2, [%dst - (offset) - 0x08];

#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub	[%src - (offset) - 0x02], %t0; \
	ldub	[%src - (offset) - 0x01], %t1; \
	stb	%t0, [%dst - (offset) - 0x02]; \
	stb	%t1, [%dst - (offset) - 0x01];
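/*
 * For orientation: each MOVE_BIGCHUNK/MOVE_BIGALIGNCHUNK invocation
 * moves one 0x20-byte block with four 8-byte ldd loads followed by
 * word (st) or doubleword (std) stores.  A minimal C sketch of the
 * same data movement, assuming 32-bit words and a word-multiple
 * offset; the function name is hypothetical and only illustrates
 * what one macro expansion does:
 *
 *	#include <stdint.h>
 *
 *	static void move_bigchunk(uint32_t *dst, const uint32_t *src,
 *				  unsigned off_bytes)
 *	{
 *		// one chunk = eight 32-bit words = 0x20 bytes
 *		for (int i = 0; i < 8; i++)
 *			dst[off_bytes / 4 + i] = src[off_bytes / 4 + i];
 *	}
 */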
	.text
	.align	4

FUNC(memmove)
EXPORT_SYMBOL(memmove)
	cmp	%o0, %o1
	mov	%o0, %g7
	bleu	9f
	sub	%o0, %o1, %o4

	add	%o1, %o2, %o3
	cmp	%o3, %o0
	bleu	0f
	andcc	%o4, 3, %o5

	add	%o1, %o2, %o1
	add	%o0, %o2, %o0
	sub	%o1, 1, %o1
	sub	%o0, 1, %o0

1:	/* reverse_bytes */

	ldub	[%o1], %o4
	subcc	%o2, 1, %o2
	stb	%o4, [%o0]
	sub	%o1, 1, %o1
	bne	1b
	sub	%o0, 1, %o0

	retl
	mov	%g7, %o0
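/*
 * The branches above implement the classic memmove direction test:
 * copy forward when dst <= src or when dst lies at or beyond src + len
 * (no harmful overlap), and fall into the reverse_bytes loop only when
 * dst lands inside the source region.  A minimal C sketch of that
 * decision, assuming flat pointers; the function name is hypothetical:
 *
 *	#include <stddef.h>
 *
 *	void *memmove_sketch(void *dstp, const void *srcp, size_t len)
 *	{
 *		char *dst = dstp;
 *		const char *src = srcp;
 *
 *		if (dst <= src || dst >= src + len) {
 *			while (len--)		// forward copy is safe
 *				*dst++ = *src++;
 *		} else {
 *			dst += len;		// overlap: walk backwards
 *			src += len;
 *			while (len--)
 *				*--dst = *--src;
 *		}
 *		return dstp;
 *	}
 */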
/* NOTE: This code is executed just for the cases
	 where %src (= %o1) & 3 != 0.
	 We need to align it to 4.  So, for (%src & 3):
	 1 we need to do ldub,lduh;
	 2 lduh;
	 3 just ldub;
	 so even if it looks weird, the branches
	 are correct here. -jj
 */
78:	/* dword_align */

	andcc	%o1, 1, %g0
	be	4f
	andcc	%o1, 2, %g0

	ldub	[%o1], %g2
	add	%o1, 1, %o1
	stb	%g2, [%o0]
	sub	%o2, 1, %o2
	bne	3f
	add	%o0, 1, %o0
4:
	lduh	[%o1], %g2
	add	%o1, 2, %o1
	sth	%g2, [%o0]
	sub	%o2, 2, %o2
	b	3f
	add	%o0, 2, %o0

FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */
EXPORT_SYMBOL(memcpy)

	sub	%o0, %o1, %o4
	mov	%o0, %g7
9:
	andcc	%o4, 3, %o5
0:
	bne	86f
	cmp	%o2, 15

	bleu	90f
	andcc	%o1, 3, %g0

	bne	78b
3:
	andcc	%o1, 4, %g0

	be	2f
	mov	%o2, %g1

	ld	[%o1], %o4
	sub	%g1, 4, %g1
	st	%o4, [%o0]
	add	%o1, 4, %o1
	add	%o0, 4, %o0
2:
	andcc	%g1, 0xffffff80, %g0
	be	3f
	andcc	%o0, 4, %g0

	be	82f + 4
5:
	MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	sub	%g1, 128, %g1
	add	%o1, 128, %o1
	cmp	%g1, 128
	bge	5b
	add	%o0, 128, %o0
3:
	andcc	%g1, 0x70, %g4
	be	80f
	andcc	%g1, 8, %g0

	sethi	%hi(80f), %o5
	srl	%g4, 1, %o4
	add	%g4, %o4, %o4
	add	%o1, %g4, %o1
	sub	%o5, %o4, %o5
	jmpl	%o5 + %lo(80f), %g0
	add	%o0, %g4, %o0
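/*
 * The sethi/jmpl pair above is a Duff's-device style computed entry
 * into the MOVE_LASTCHUNK ladder below: %g4 holds the remaining length
 * masked with 0x70, each MOVE_LASTCHUNK body is six instructions
 * (24 bytes) and copies 16 bytes, so control enters at
 * 80f - (%g4/16)*24 = 80f - (%g4 + %g4/2), with both pointers
 * pre-advanced by %g4 so the bodies can index backwards.  (The aligned
 * ladder at 83f uses 16-byte bodies, so there the offset is just %g4.)
 * A hedged C analogue of the dispatch; names are illustrative only:
 *
 *	#include <string.h>
 *
 *	static void copy_tail_chunks(char *dst, const char *src,
 *				     unsigned rem)
 *	{
 *		unsigned g4 = rem & 0x70;	// 0..112, multiple of 16
 *
 *		src += g4;
 *		dst += g4;
 *		switch (g4 >> 4) {		// deliberate fall-through
 *		case 7: memcpy(dst - 0x70, src - 0x70, 16);
 *		case 6: memcpy(dst - 0x60, src - 0x60, 16);
 *		case 5: memcpy(dst - 0x50, src - 0x50, 16);
 *		case 4: memcpy(dst - 0x40, src - 0x40, 16);
 *		case 3: memcpy(dst - 0x30, src - 0x30, 16);
 *		case 2: memcpy(dst - 0x20, src - 0x20, 16);
 *		case 1: memcpy(dst - 0x10, src - 0x10, 16);
 *		}
 *	}
 */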
79:	/* memcpy_table */

	MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

80:	/* memcpy_table_end */
	be	81f
	andcc	%g1, 4, %g0

	ldd	[%o1], %g2
	add	%o0, 8, %o0
	st	%g2, [%o0 - 0x08]
	add	%o1, 8, %o1
	st	%g3, [%o0 - 0x04]

81:	/* memcpy_last7 */

	be	1f
	andcc	%g1, 2, %g0

	ld	[%o1], %g2
	add	%o1, 4, %o1
	st	%g2, [%o0]
	add	%o0, 4, %o0
1:
	be	1f
	andcc	%g1, 1, %g0

	lduh	[%o1], %g2
	add	%o1, 2, %o1
	sth	%g2, [%o0]
	add	%o0, 2, %o0
1:
	be	1f
	nop

	ldub	[%o1], %g2
	stb	%g2, [%o0]
1:
	retl
	mov	%g7, %o0

82:	/* ldd_std */
	MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	subcc	%g1, 128, %g1
	add	%o1, 128, %o1
	cmp	%g1, 128
	bge	82b
	add	%o0, 128, %o0

	andcc	%g1, 0x70, %g4
	be	84f
	andcc	%g1, 8, %g0

	sethi	%hi(84f), %o5
	add	%o1, %g4, %o1
	sub	%o5, %g4, %o5
	jmpl	%o5 + %lo(84f), %g0
	add	%o0, %g4, %o0

83:	/* amemcpy_table */

	MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

84:	/* amemcpy_table_end */
	be	85f
	andcc	%g1, 4, %g0

	ldd	[%o1], %g2
	add	%o0, 8, %o0
	std	%g2, [%o0 - 0x08]
	add	%o1, 8, %o1
85:	/* amemcpy_last7 */
	be	1f
	andcc	%g1, 2, %g0

	ld	[%o1], %g2
	add	%o1, 4, %o1
	st	%g2, [%o0]
	add	%o0, 4, %o0
1:
	be	1f
	andcc	%g1, 1, %g0

	lduh	[%o1], %g2
	add	%o1, 2, %o1
	sth	%g2, [%o0]
	add	%o0, 2, %o0
1:
	be	1f
	nop

	ldub	[%o1], %g2
	stb	%g2, [%o0]
1:
	retl
	mov	%g7, %o0

86:	/* non_aligned */
	cmp	%o2, 6
	bleu	88f
	nop

	save	%sp, -96, %sp
	andcc	%i0, 3, %g0
	be	61f
	andcc	%i0, 1, %g0
	be	60f
	andcc	%i0, 2, %g0

	ldub	[%i1], %g5
	add	%i1, 1, %i1
	stb	%g5, [%i0]
	sub	%i2, 1, %i2
	bne	61f
	add	%i0, 1, %i0
60:
	ldub	[%i1], %g3
	add	%i1, 2, %i1
	stb	%g3, [%i0]
	sub	%i2, 2, %i2
	ldub	[%i1 - 1], %g3
	add	%i0, 2, %i0
	stb	%g3, [%i0 - 1]
61:
	and	%i1, 3, %g2
	and	%i2, 0xc, %g3
	and	%i1, -4, %i1
	cmp	%g3, 4
	sll	%g2, 3, %g4
	mov	32, %g2
	be	4f
	sub	%g2, %g4, %l0

	blu	3f
	cmp	%g3, 0x8

	be	2f
	srl	%i2, 2, %g3

	ld	[%i1], %i3
	add	%i0, -8, %i0
	ld	[%i1 + 4], %i4
	b	8f
	add	%g3, 1, %g3
2:
	ld	[%i1], %i4
	add	%i0, -12, %i0
	ld	[%i1 + 4], %i5
	add	%g3, 2, %g3
	b	9f
	add	%i1, -4, %i1
3:
	ld	[%i1], %g1
	add	%i0, -4, %i0
	ld	[%i1 + 4], %i3
	srl	%i2, 2, %g3
	b	7f
	add	%i1, 4, %i1
4:
	ld	[%i1], %i5
	cmp	%i2, 7
	ld	[%i1 + 4], %g1
	srl	%i2, 2, %g3
	bleu	10f
	add	%i1, 8, %i1

	ld	[%i1], %i3
	add	%g3, -1, %g3
5:
	sll	%i5, %g4, %g2
	srl	%g1, %l0, %g5
	or	%g2, %g5, %g2
	st	%g2, [%i0]
7:
	ld	[%i1 + 4], %i4
	sll	%g1, %g4, %g2
	srl	%i3, %l0, %g5
	or	%g2, %g5, %g2
	st	%g2, [%i0 + 4]
8:
	ld	[%i1 + 8], %i5
	sll	%i3, %g4, %g2
	srl	%i4, %l0, %g5
	or	%g2, %g5, %g2
	st	%g2, [%i0 + 8]
9:
	ld	[%i1 + 12], %g1
	sll	%i4, %g4, %g2
	srl	%i5, %l0, %g5
	addcc	%g3, -4, %g3
	or	%g2, %g5, %g2
	add	%i1, 16, %i1
	st	%g2, [%i0 + 12]
	add	%i0, 16, %i0
	bne,a	5b
	ld	[%i1], %i3
10:
	sll	%i5, %g4, %g2
	srl	%g1, %l0, %g5
	srl	%l0, 3, %g3
	or	%g2, %g5, %g2
	sub	%i1, %g3, %i1
	andcc	%i2, 2, %g0
	st	%g2, [%i0]
	be	1f
	andcc	%i2, 1, %g0

	ldub	[%i1], %g2
	add	%i1, 2, %i1
	stb	%g2, [%i0 + 4]
	add	%i0, 2, %i0
	ldub	[%i1 - 1], %g2
	stb	%g2, [%i0 + 3]
1:
	be	1f
	nop
	ldub	[%i1], %g2
	stb	%g2, [%i0 + 4]
1:
	ret
	restore %g7, %g0, %o0
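/*
 * The 5:/7:/8:/9: loop above copies between pointers whose low two
 * address bits differ: the source is read as aligned words and each
 * destination word is assembled from two neighbours with a shift/or
 * pair, where %g4 = (src & 3) * 8 and %l0 = 32 - %g4.  A minimal C
 * sketch of one such combine step on big-endian 32-bit words; the
 * function name is hypothetical:
 *
 *	#include <stdint.h>
 *
 *	static uint32_t combine(uint32_t prev, uint32_t next, unsigned g4)
 *	{
 *		// big endian: the output's high bytes are the low bytes
 *		// of the earlier word, the rest come from the later one
 *		return (prev << g4) | (next >> (32 - g4));	// g4 != 0
 *	}
 */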
88:	/* short_end */

	and	%o2, 0xe, %o3
20:
	sethi	%hi(89f), %o5
	sll	%o3, 3, %o4
	add	%o0, %o3, %o0
	sub	%o5, %o4, %o5
	add	%o1, %o3, %o1
	jmpl	%o5 + %lo(89f), %g0
	andcc	%o2, 1, %g0

	MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

89:	/* short_table_end */

	be	1f
	nop

	ldub	[%o1], %g2
	stb	%g2, [%o0]
1:
	retl
	mov	%g7, %o0

90:	/* short_aligned_end */
	bne	88b
	andcc	%o2, 8, %g0

	be	1f
	andcc	%o2, 4, %g0

	ld	[%o1 + 0x00], %g2
	ld	[%o1 + 0x04], %g3
	add	%o1, 8, %o1
	st	%g2, [%o0 + 0x00]
	st	%g3, [%o0 + 0x04]
	add	%o0, 8, %o0
1:
	b	81b
	mov	%o2, %g1
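/*
 * short_aligned_end and the *_last7 paths finish a mutually aligned
 * copy of fewer than 16 bytes by testing individual bits of the
 * remaining length, widest unit first.  A hedged C sketch of that
 * tail handling; the function name is illustrative:
 *
 *	#include <string.h>
 *
 *	static void copy_short_tail(char *dst, const char *src,
 *				    unsigned len)	// len < 16
 *	{
 *		if (len & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }
 *		if (len & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
 *		if (len & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
 *		if (len & 1) { *dst = *src; }
 *	}
 */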