
TOMOYO Linux Cross Reference
Linux/arch/hexagon/lib/memcpy.S


Diff markup

Differences between /arch/hexagon/lib/memcpy.S (Architecture hexagon) and /arch/mips/lib/memcpy.S (Architecture mips)


  1 /* SPDX-License-Identifier: GPL-2.0-only */    << 
  2 /*                                                  1 /*
  3  * Copyright (c) 2010-2011, The Linux Foundati !!   2  * This file is subject to the terms and conditions of the GNU General Public
                                                   >>   3  * License.  See the file "COPYING" in the main directory of this archive
                                                   >>   4  * for more details.
                                                   >>   5  *
                                                   >>   6  * Unified implementation of memcpy, memmove and the __copy_user backend.
                                                   >>   7  *
                                                   >>   8  * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
                                                   >>   9  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
                                                   >>  10  * Copyright (C) 2002 Broadcom, Inc.
                                                   >>  11  *   memcpy/copy_user author: Mark Vandevoorde
                                                   >>  12  * Copyright (C) 2007  Maciej W. Rozycki
                                                   >>  13  * Copyright (C) 2014 Imagination Technologies Ltd.
                                                   >>  14  *
                                                   >>  15  * Mnemonic names for arguments to memcpy/__copy_user
  4  */                                                16  */
  5                                                    17 
  6 /*                                                 18 /*
  7  * Description                                 !!  19  * Hack to resolve longstanding prefetch issue
  8  *                                             << 
  9  *   library function for memcpy where length  << 
 10  *   ptr_in to ptr_out. ptr_out is returned un << 
 11  *   Allows any combination of alignment on in << 
 12  *   and length from 0 to 2^32-1               << 
 13  *                                             << 
 14  * Restrictions                                << 
 15  *   The arrays should not overlap, the progra << 
 16  *   if they do.                               << 
 17  *   For blocks less than 16 bytes a byte by b << 
 18  *   8byte alignments, and length multiples, a << 
 19  *   96bytes                                   << 
 20  * History                                     << 
 21  *                                             << 
 22  *   DJH  5/15/09 Initial version 1.0          << 
 23  *   DJH  6/ 1/09 Version 1.1 modified ABI to  << 
 24  *   DJH  7/12/09 Version 1.2 optimized codesi << 
 25  *   DJH 10/14/09 Version 1.3 added special lo << 
 26  *                            overreading bloa << 
 27  *   DJH  4/20/10 Version 1.4 fixed Ldword_loo << 
 28  *                            occurring if onl << 
 29  *                            # 3888, correcte << 
 30  *                            1 32byte chunk f << 
 31  *                            loop at end to s << 
 32  *                            over read.  Fixe << 
 33  *                            overread for blo << 
 34  *                            codesize to 752  << 
 35  *   DJH  4/21/10 version 1.5 1.4 fix broke co << 
 36  *                            aligned to dword << 
 37  *                            byte, added dete << 
 38  *                            little bloat.    << 
 39  *   DJH  4/23/10 version 1.6 corrected stack  << 
 40  *                            always, fixed th << 
 41  *                            before it was be << 
 42  * Natural c model                             << 
 43  * ===============                             << 
 44  * void * memcpy(char * ptr_out, char * ptr_in << 
 45  *   int i;                                    << 
 46  *   if(length) for(i=0; i < length; i++) { pt << 
 47  *   return(ptr_out);                          << 
 48  * }                                           << 
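The truncated "Natural c model" above is just a guarded byte loop; completed from the visible fragments (a reconstruction sketch, not the verbatim kernel comment):

        void * memcpy(char * ptr_out, char * ptr_in, int length) {
          int i;
          /* copy one byte per iteration; length may be 0 */
          if (length) for (i = 0; i < length; i++) { ptr_out[i] = ptr_in[i]; }
          return(ptr_out);
        }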
 49  *                                             << 
 50  * Optimized memcpy function                   << 
 51  * =========================                   << 
 52  * void * memcpy(char * ptr_out, char * ptr_in << 
 53  *   int i, prolog, kernel, epilog, mask;      << 
 54  *   u8 offset;                                << 
 55  *   s64 data0, dataF8, data70;                << 
 56  *                                             << 
 57  *   s64 * ptr8_in;                            << 
 58  *   s64 * ptr8_out;                           << 
 59  *   s32 * ptr4;                               << 
 60  *   s16 * ptr2;                               << 
 61  *                                             << 
 62  *   offset = ((int) ptr_in) & 7;              << 
 63  *   ptr8_in = (s64 *) &ptr_in[-offset];   //r << 
 64  *                                             << 
 65  *   data70 = *ptr8_in++;                      << 
 66  *   dataF8 = *ptr8_in++;                      << 
 67  *                                             << 
 68  *   data0 = HEXAGON_P_valignb_PPp(dataF8, dat << 
 69  *                                             << 
 70  *   prolog = 32 - ((int) ptr_out);            << 
 71  *   mask  = 0x7fffffff >> HEXAGON_R_cl0_R(len << 
 72  *   prolog = prolog & mask;                   << 
 73  *   kernel = len - prolog;                    << 
 74  *   epilog = kernel & 0x1F;                   << 
 75  *   kernel = kernel>>5;                       << 
 76  *                                             << 
 77  *   if (prolog & 1) { ptr_out[0] = (u8) data0 << 
 78  *   ptr2 = (s16 *) &ptr_out[0];               << 
 79  *   if (prolog & 2) { ptr2[0] = (u16) data0;  << 
 80  *   ptr4 = (s32 *) &ptr_out[0];               << 
 81  *   if (prolog & 4) { ptr4[0] = (u32) data0;  << 
 82  *                                             << 
 83  *   offset = offset + (prolog & 7);           << 
 84  *   if (offset >= 8) {                        << 
 85  *     data70 = dataF8;                        << 
 86  *     dataF8 = *ptr8_in++;                    << 
 87  *   }                                         << 
 88  *   offset = offset & 0x7;                    << 
 89  *                                                 20  *
 90  *   prolog = prolog >> 3;                     !!  21  * Prefetching may be fatal on some systems if we're prefetching beyond the
 91  *   if (prolog) for (i=0; i < prolog; i++) {  !!  22  * end of memory.  It's also a seriously bad idea on non
 92  *       data0 = HEXAGON_P_valignb_PPp(dataF8, !!  23  * dma-coherent systems.
 93  *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8 !!  24  */
 94  *       data70 = dataF8;                      !!  25 #ifdef CONFIG_DMA_NONCOHERENT
 95  *       dataF8 = *ptr8_in++;                  !!  26 #undef CONFIG_CPU_HAS_PREFETCH
 96  *   }                                         !!  27 #endif
 97  *   if(kernel) { kernel -= 1; epilog += 32; } !!  28 #ifdef CONFIG_MIPS_MALTA
 98  *   if(kernel) for(i=0; i < kernel; i++) {    !!  29 #undef CONFIG_CPU_HAS_PREFETCH
 99  *       data0 = HEXAGON_P_valignb_PPp(dataF8, !!  30 #endif
100  *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8 !!  31 #ifdef CONFIG_CPU_MIPSR6
101  *       data70 = *ptr8_in++;                  !!  32 #undef CONFIG_CPU_HAS_PREFETCH
102  *                                             !!  33 #endif
103  *       data0 = HEXAGON_P_valignb_PPp(data70, !!  34 
104  *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8 !!  35 #include <linux/export.h>
105  *       dataF8 = *ptr8_in++;                  !!  36 #include <asm/asm.h>
106  *                                             !!  37 #include <asm/asm-offsets.h>
107  *       data0 = HEXAGON_P_valignb_PPp(dataF8, !!  38 #include <asm/regdef.h>
108  *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8 !!  39 
109  *       data70 = *ptr8_in++;                  !!  40 #define dst a0
110  *                                             !!  41 #define src a1
111  *       data0 = HEXAGON_P_valignb_PPp(data70, !!  42 #define len a2
112  *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8 !!  43 
113  *       dataF8 = *ptr8_in++;                  !!  44 /*
114  *   }                                         !!  45  * Spec
115  *   epilogdws = epilog >> 3;                  << 
116  *   if (epilogdws) for (i=0; i < epilogdws; i << 
117  *       data0 = HEXAGON_P_valignb_PPp(dataF8, << 
118  *       ptr8_out = (s64 *) &ptr_out[0]; *ptr8 << 
119  *       data70 = dataF8;                      << 
120  *       dataF8 = *ptr8_in++;                  << 
121  *   }                                         << 
122  *   data0 = HEXAGON_P_valignb_PPp(dataF8, dat << 
123  *                                                 46  *
124  *   ptr4 = (s32 *) &ptr_out[0];               !!  47  * memcpy copies len bytes from src to dst and sets v0 to dst.
125  *   if (epilog & 4) { ptr4[0] = (u32) data0;  !!  48  * It assumes that
126  *   ptr2 = (s16 *) &ptr_out[0];               !!  49  *   - src and dst don't overlap
127  *   if (epilog & 2) { ptr2[0] = (u16) data0;  !!  50  *   - src is readable
128  *   if (epilog & 1) { *ptr_out++ = (u8) data0 !!  51  *   - dst is writable
                                                   >>  52  * memcpy uses the standard calling convention
                                                   >>  53  *
                                                   >>  54  * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
                                                   >>  55  * the number of uncopied bytes due to an exception caused by a read or write.
                                                   >>  56  * __copy_user assumes that src and dst don't overlap, and that the call is
                                                   >>  57  * implementing one of the following:
                                                   >>  58  *   copy_to_user
                                                   >>  59  *     - src is readable  (no exceptions when reading src)
                                                   >>  60  *   copy_from_user
                                                   >>  61  *     - dst is writable  (no exceptions when writing dst)
                                                   >>  62  * __copy_user uses a non-standard calling convention; see
                                                   >>  63  * include/asm-mips/uaccess.h
                                                   >>  64  *
                                                   >>  65  * When an exception happens on a load, the handler must
                                                   >>  66  * ensure that all of the destination buffer is overwritten to prevent
                                                   >>  67  * leaking information to user mode programs.
                                                   >>  68  */
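A caller-side illustration of the contract described above. This is only a sketch of the "return the number of uncopied bytes" convention; kbuf, ubuf and len are hypothetical names, and the real wrappers live in the arch-independent uaccess headers:

        /* copy_from_user() returns the number of bytes that could NOT be
         * copied; 0 means the whole buffer was transferred. */
        if (copy_from_user(kbuf, ubuf, len))
                return -EFAULT;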
                                                   >>  69 
                                                   >>  70 /*
                                                   >>  71  * Implementation
                                                   >>  72  */
                                                   >>  73 
                                                   >>  74 /*
                                                   >>  75  * The exception handler for loads requires that:
                                                   >>  76  *  1- AT contain the address of the byte just past the end of the source
                                                   >>  77  *     of the copy,
                                                   >>  78  *  2- src_entry <= src < AT, and
                                                   >>  79  *  3- (dst - src) == (dst_entry - src_entry),
                                                   >>  80  * The _entry suffix denotes values when __copy_user was called.
                                                   >>  81  *
                                                   >>  82  * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
                                                   >>  83  * (2) is met by incrementing src by the number of bytes copied
                                                   >>  84  * (3) is met by not doing loads between a pair of increments of dst and src
129  *                                                 85  *
130  *   return(ptr_out - length);                 !!  86  * The exception handlers for stores adjust len (if necessary) and return.
131  * }                                           !!  87  * These handlers do not need to overwrite any data.
132  *                                                 88  *
133  * Codesize : 784 bytes                        !!  89  * For __rmemcpy and memmove an exception is always a kernel bug, therefore
                                                   >>  90  * they're not protected.
                                                   >>  91  */
                                                   >>  92 
                                                   >>  93 /* Instruction type */
                                                   >>  94 #define LD_INSN 1
                                                   >>  95 #define ST_INSN 2
                                                   >>  96 /* Prefetch type */
                                                   >>  97 #define SRC_PREFETCH 1
                                                   >>  98 #define DST_PREFETCH 2
                                                   >>  99 #define LEGACY_MODE 1
                                                   >> 100 #define EVA_MODE    2
                                                   >> 101 #define USEROP   1
                                                   >> 102 #define KERNELOP 2
                                                   >> 103 
                                                   >> 104 /*
                                                   >> 105  * Wrapper to add an entry in the exception table
                                                   >> 106  * in case the insn causes a memory exception.
                                                   >> 107  * Arguments:
                                                   >> 108  * insn    : Load/store instruction
                                                   >> 109  * type    : Instruction type
                                                   >> 110  * reg     : Register
                                                   >> 111  * addr    : Address
                                                   >> 112  * handler : Exception handler
                                                   >> 113  */
                                                   >> 114 
                                                   >> 115 #define EXC(insn, type, reg, addr, handler)                     \
                                                   >> 116         .if \mode == LEGACY_MODE;                               \
                                                   >> 117 9:              insn reg, addr;                                 \
                                                   >> 118                 .section __ex_table,"a";                        \
                                                   >> 119                 PTR_WD  9b, handler;                            \
                                                   >> 120                 .previous;                                      \
                                                   >> 121         /* This is assembled in EVA mode */                     \
                                                   >> 122         .else;                                                  \
                                                   >> 123                 /* If loading from user or storing to user */   \
                                                   >> 124                 .if ((\from == USEROP) && (type == LD_INSN)) || \
                                                   >> 125                     ((\to == USEROP) && (type == ST_INSN));     \
                                                   >> 126 9:                      __BUILD_EVA_INSN(insn##e, reg, addr);   \
                                                   >> 127                         .section __ex_table,"a";                \
                                                   >> 128                         PTR_WD  9b, handler;                    \
                                                   >> 129                         .previous;                              \
                                                   >> 130                 .else;                                          \
                                                   >> 131                         /*                                      \
                                                   >> 132                          *  Still in EVA, but no need for       \
                                                   >> 133                          * exception handler or EVA insn        \
                                                   >> 134                          */                                     \
                                                   >> 135                         insn reg, addr;                         \
                                                   >> 136                 .endif;                                         \
                                                   >> 137         .endif
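Roughly, in LEGACY_MODE a use such as EXC(lw, LD_INSN, t0, 0(src), .Ll_exc) assembles to the following (labels simplified; a sketch of the expansion, not extra code in the file):

        9:      lw      t0, 0(src)              # the instruction that may fault
                .section __ex_table,"a"
                PTR_WD  9b, .Ll_exc             # (faulting insn, fixup handler) pair
                .previous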
                                                   >> 138 
                                                   >> 139 /*
                                                   >> 140  * Only on the 64-bit kernel can we make use of 64-bit registers.
                                                   >> 141  */
                                                   >> 142 #ifdef CONFIG_64BIT
                                                   >> 143 #define USE_DOUBLE
                                                   >> 144 #endif
                                                   >> 145 
                                                   >> 146 #ifdef USE_DOUBLE
                                                   >> 147 
                                                   >> 148 #define LOADK ld /* No exception */
                                                   >> 149 #define LOAD(reg, addr, handler)        EXC(ld, LD_INSN, reg, addr, handler)
                                                   >> 150 #define LOADL(reg, addr, handler)       EXC(ldl, LD_INSN, reg, addr, handler)
                                                   >> 151 #define LOADR(reg, addr, handler)       EXC(ldr, LD_INSN, reg, addr, handler)
                                                   >> 152 #define STOREL(reg, addr, handler)      EXC(sdl, ST_INSN, reg, addr, handler)
                                                   >> 153 #define STORER(reg, addr, handler)      EXC(sdr, ST_INSN, reg, addr, handler)
                                                   >> 154 #define STORE(reg, addr, handler)       EXC(sd, ST_INSN, reg, addr, handler)
                                                   >> 155 #define ADD    daddu
                                                   >> 156 #define SUB    dsubu
                                                   >> 157 #define SRL    dsrl
                                                   >> 158 #define SRA    dsra
                                                   >> 159 #define SLL    dsll
                                                   >> 160 #define SLLV   dsllv
                                                   >> 161 #define SRLV   dsrlv
                                                   >> 162 #define NBYTES 8
                                                   >> 163 #define LOG_NBYTES 3
                                                   >> 164 
                                                   >> 165 /*
                                                   >> 166  * As we are sharing the code base with the mips32 tree (which uses the
                                                   >> 167  * o32 ABI register definitions), we need to redefine the register
                                                   >> 168  * definitions from the n64 ABI register naming to the o32 ABI naming.
                                                   >> 169  */
                                                   >> 170 #undef t0
                                                   >> 171 #undef t1
                                                   >> 172 #undef t2
                                                   >> 173 #undef t3
                                                   >> 174 #define t0      $8
                                                   >> 175 #define t1      $9
                                                   >> 176 #define t2      $10
                                                   >> 177 #define t3      $11
                                                   >> 178 #define t4      $12
                                                   >> 179 #define t5      $13
                                                   >> 180 #define t6      $14
                                                   >> 181 #define t7      $15
                                                   >> 182 
                                                   >> 183 #else
                                                   >> 184 
                                                   >> 185 #define LOADK lw /* No exception */
                                                   >> 186 #define LOAD(reg, addr, handler)        EXC(lw, LD_INSN, reg, addr, handler)
                                                   >> 187 #define LOADL(reg, addr, handler)       EXC(lwl, LD_INSN, reg, addr, handler)
                                                   >> 188 #define LOADR(reg, addr, handler)       EXC(lwr, LD_INSN, reg, addr, handler)
                                                   >> 189 #define STOREL(reg, addr, handler)      EXC(swl, ST_INSN, reg, addr, handler)
                                                   >> 190 #define STORER(reg, addr, handler)      EXC(swr, ST_INSN, reg, addr, handler)
                                                   >> 191 #define STORE(reg, addr, handler)       EXC(sw, ST_INSN, reg, addr, handler)
                                                   >> 192 #define ADD    addu
                                                   >> 193 #define SUB    subu
                                                   >> 194 #define SRL    srl
                                                   >> 195 #define SLL    sll
                                                   >> 196 #define SRA    sra
                                                   >> 197 #define SLLV   sllv
                                                   >> 198 #define SRLV   srlv
                                                   >> 199 #define NBYTES 4
                                                   >> 200 #define LOG_NBYTES 2
                                                   >> 201 
                                                   >> 202 #endif /* USE_DOUBLE */
                                                   >> 203 
                                                   >> 204 #define LOADB(reg, addr, handler)       EXC(lb, LD_INSN, reg, addr, handler)
                                                   >> 205 #define STOREB(reg, addr, handler)      EXC(sb, ST_INSN, reg, addr, handler)
                                                   >> 206 
                                                   >> 207 #ifdef CONFIG_CPU_HAS_PREFETCH
                                                   >> 208 # define _PREF(hint, addr, type)                                        \
                                                   >> 209         .if \mode == LEGACY_MODE;                                       \
                                                   >> 210                 kernel_pref(hint, addr);                                \
                                                   >> 211         .else;                                                          \
                                                   >> 212                 .if ((\from == USEROP) && (type == SRC_PREFETCH)) ||    \
                                                   >> 213                     ((\to == USEROP) && (type == DST_PREFETCH));        \
                                                   >> 214                         /*                                              \
                                                   >> 215                          * PREFE has only 9 bits for the offset         \
                                                   >> 216                          * compared to PREF which has 16, so it may     \
                                                   >> 217                          * need to use the $at register but this        \
                                                   >> 218                          * register should remain intact because it's   \
                                                   >> 219                          * used later on. Therefore use $v1.            \
                                                   >> 220                          */                                             \
                                                   >> 221                         .set at=v1;                                     \
                                                   >> 222                         user_pref(hint, addr);                          \
                                                   >> 223                         .set noat;                                      \
                                                   >> 224                 .else;                                                  \
                                                   >> 225                         kernel_pref(hint, addr);                        \
                                                   >> 226                 .endif;                                                 \
                                                   >> 227         .endif
                                                   >> 228 #else
                                                   >> 229 # define _PREF(hint, addr, type)
                                                   >> 230 #endif
                                                   >> 231 
                                                   >> 232 #define PREFS(hint, addr) _PREF(hint, addr, SRC_PREFETCH)
                                                   >> 233 #define PREFD(hint, addr) _PREF(hint, addr, DST_PREFETCH)
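In LEGACY_MODE both helpers reduce to kernel_pref(). Assuming kernel_pref() emits a bare pref instruction when CONFIG_CPU_HAS_PREFETCH is enabled (its definition lives in asm/asm.h, outside this file), a use such as PREFS(0, 0(src)) becomes, in effect:

        pref    0, 0(src)               # hint 0 = PREF_LOAD, prefetch the source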
                                                   >> 234 
                                                   >> 235 #ifdef CONFIG_CPU_LITTLE_ENDIAN
                                                   >> 236 #define LDFIRST LOADR
                                                   >> 237 #define LDREST  LOADL
                                                   >> 238 #define STFIRST STORER
                                                   >> 239 #define STREST  STOREL
                                                   >> 240 #define SHIFT_DISCARD SLLV
                                                   >> 241 #else
                                                   >> 242 #define LDFIRST LOADL
                                                   >> 243 #define LDREST  LOADR
                                                   >> 244 #define STFIRST STOREL
                                                   >> 245 #define STREST  STORER
                                                   >> 246 #define SHIFT_DISCARD SRLV
                                                   >> 247 #endif
                                                   >> 248 
                                                   >> 249 #define FIRST(unit) ((unit)*NBYTES)
                                                   >> 250 #define REST(unit)  (FIRST(unit)+NBYTES-1)
                                                   >> 251 #define UNIT(unit)  FIRST(unit)
                                                   >> 252 
                                                   >> 253 #define ADDRMASK (NBYTES-1)
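A little-endian C analogy for the LDFIRST/LDREST pair (NBYTES == 4): an unaligned word is assembled from the two aligned words that straddle it. Illustrative only; the lwr/lwl instructions perform this merge in hardware without the explicit shifts:

        #include <stdint.h>
        #include <string.h>

        static uint32_t unaligned_load32(const uint8_t *p)
        {
                unsigned off = (uintptr_t)p & 3;        /* ADDRMASK */
                uint32_t lo, hi;

                memcpy(&lo, p - off, 4);        /* aligned word holding FIRST(0) */
                if (!off)
                        return lo;              /* already aligned */
                memcpy(&hi, p - off + 4, 4);    /* aligned word holding REST(0) */
                return (lo >> (8 * off)) | (hi << (32 - 8 * off));
        }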
                                                   >> 254 
                                                   >> 255         .text
                                                   >> 256         .set    noreorder
                                                   >> 257 #ifndef CONFIG_CPU_DADDI_WORKAROUNDS
                                                   >> 258         .set    noat
                                                   >> 259 #else
                                                   >> 260         .set    at=v1
                                                   >> 261 #endif
                                                   >> 262 
                                                   >> 263         .align  5
                                                   >> 264 
                                                   >> 265         /*
                                                   >> 266          * Macro to build the __copy_user common code
                                                   >> 267          * Arguments:
                                                   >> 268          * mode : LEGACY_MODE or EVA_MODE
                                                   >> 269          * from : Source operand. USEROP or KERNELOP
                                                   >> 270          * to   : Destination operand. USEROP or KERNELOP
                                                   >> 271          */
                                                   >> 272         .macro __BUILD_COPY_USER mode, from, to
                                                   >> 273 
                                                   >> 274         /* initialize __memcpy if this is the first time we execute this macro */
                                                   >> 275         .ifnotdef __memcpy
                                                   >> 276         .set __memcpy, 1
                                                   >> 277         .hidden __memcpy /* make sure it does not leak */
                                                   >> 278         .endif
                                                   >> 279 
                                                   >> 280         /*
                                                   >> 281          * Note: dst & src may be unaligned, len may be 0
                                                   >> 282          * Temps
                                                   >> 283          */
                                                   >> 284 #define rem t8
                                                   >> 285 
                                                   >> 286         R10KCBARRIER(0(ra))
                                                   >> 287         /*
                                                   >> 288          * The "issue break"s below are very approximate.
                                                   >> 289          * Issue delays for dcache fills will perturb the schedule, as will
                                                   >> 290          * load queue full replay traps, etc.
                                                   >> 291          *
                                                   >> 292          * If len < NBYTES use byte operations.
                                                   >> 293          */
                                                   >> 294         PREFS(  0, 0(src) )
                                                   >> 295         PREFD(  1, 0(dst) )
                                                   >> 296         sltu    t2, len, NBYTES
                                                   >> 297         and     t1, dst, ADDRMASK
                                                   >> 298         PREFS(  0, 1*32(src) )
                                                   >> 299         PREFD(  1, 1*32(dst) )
                                                   >> 300         bnez    t2, .Lcopy_bytes_checklen\@
                                                   >> 301          and    t0, src, ADDRMASK
                                                   >> 302         PREFS(  0, 2*32(src) )
                                                   >> 303         PREFD(  1, 2*32(dst) )
                                                   >> 304 #ifndef CONFIG_CPU_NO_LOAD_STORE_LR
                                                   >> 305         bnez    t1, .Ldst_unaligned\@
                                                   >> 306          nop
                                                   >> 307         bnez    t0, .Lsrc_unaligned_dst_aligned\@
                                                   >> 308 #else /* CONFIG_CPU_NO_LOAD_STORE_LR */
                                                   >> 309         or      t0, t0, t1
                                                   >> 310         bnez    t0, .Lcopy_unaligned_bytes\@
                                                   >> 311 #endif /* CONFIG_CPU_NO_LOAD_STORE_LR */
                                                   >> 312         /*
                                                   >> 313          * use delay slot for fall-through
                                                   >> 314          * src and dst are aligned; need to compute rem
                                                   >> 315          */
                                                   >> 316 .Lboth_aligned\@:
                                                   >> 317          SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
                                                   >> 318         beqz    t0, .Lcleanup_both_aligned\@ # len < 8*NBYTES
                                                   >> 319          and    rem, len, (8*NBYTES-1)   # rem = len % (8*NBYTES)
                                                   >> 320         PREFS(  0, 3*32(src) )
                                                   >> 321         PREFD(  1, 3*32(dst) )
                                                   >> 322         .align  4
                                                   >> 323 1:
                                                   >> 324         R10KCBARRIER(0(ra))
                                                   >> 325         LOAD(t0, UNIT(0)(src), .Ll_exc\@)
                                                   >> 326         LOAD(t1, UNIT(1)(src), .Ll_exc_copy\@)
                                                   >> 327         LOAD(t2, UNIT(2)(src), .Ll_exc_copy\@)
                                                   >> 328         LOAD(t3, UNIT(3)(src), .Ll_exc_copy\@)
                                                   >> 329         SUB     len, len, 8*NBYTES
                                                   >> 330         LOAD(t4, UNIT(4)(src), .Ll_exc_copy\@)
                                                   >> 331         LOAD(t7, UNIT(5)(src), .Ll_exc_copy\@)
                                                   >> 332         STORE(t0, UNIT(0)(dst), .Ls_exc_p8u\@)
                                                   >> 333         STORE(t1, UNIT(1)(dst), .Ls_exc_p7u\@)
                                                   >> 334         LOAD(t0, UNIT(6)(src), .Ll_exc_copy\@)
                                                   >> 335         LOAD(t1, UNIT(7)(src), .Ll_exc_copy\@)
                                                   >> 336         ADD     src, src, 8*NBYTES
                                                   >> 337         ADD     dst, dst, 8*NBYTES
                                                   >> 338         STORE(t2, UNIT(-6)(dst), .Ls_exc_p6u\@)
                                                   >> 339         STORE(t3, UNIT(-5)(dst), .Ls_exc_p5u\@)
                                                   >> 340         STORE(t4, UNIT(-4)(dst), .Ls_exc_p4u\@)
                                                   >> 341         STORE(t7, UNIT(-3)(dst), .Ls_exc_p3u\@)
                                                   >> 342         STORE(t0, UNIT(-2)(dst), .Ls_exc_p2u\@)
                                                   >> 343         STORE(t1, UNIT(-1)(dst), .Ls_exc_p1u\@)
                                                   >> 344         PREFS(  0, 8*32(src) )
                                                   >> 345         PREFD(  1, 8*32(dst) )
                                                   >> 346         bne     len, rem, 1b
                                                   >> 347          nop
                                                   >> 348 
                                                   >> 349         /*
                                                   >> 350          * len == rem == the number of bytes left to copy < 8*NBYTES
                                                   >> 351          */
                                                   >> 352 .Lcleanup_both_aligned\@:
                                                   >> 353         beqz    len, .Ldone\@
                                                   >> 354          sltu   t0, len, 4*NBYTES
                                                   >> 355         bnez    t0, .Lless_than_4units\@
                                                   >> 356          and    rem, len, (NBYTES-1)    # rem = len % NBYTES
                                                   >> 357         /*
                                                   >> 358          * len >= 4*NBYTES
                                                   >> 359          */
                                                   >> 360         LOAD( t0, UNIT(0)(src), .Ll_exc\@)
                                                   >> 361         LOAD( t1, UNIT(1)(src), .Ll_exc_copy\@)
                                                   >> 362         LOAD( t2, UNIT(2)(src), .Ll_exc_copy\@)
                                                   >> 363         LOAD( t3, UNIT(3)(src), .Ll_exc_copy\@)
                                                   >> 364         SUB     len, len, 4*NBYTES
                                                   >> 365         ADD     src, src, 4*NBYTES
                                                   >> 366         R10KCBARRIER(0(ra))
                                                   >> 367         STORE(t0, UNIT(0)(dst), .Ls_exc_p4u\@)
                                                   >> 368         STORE(t1, UNIT(1)(dst), .Ls_exc_p3u\@)
                                                   >> 369         STORE(t2, UNIT(2)(dst), .Ls_exc_p2u\@)
                                                   >> 370         STORE(t3, UNIT(3)(dst), .Ls_exc_p1u\@)
                                                   >> 371         .set    reorder                         /* DADDI_WAR */
                                                   >> 372         ADD     dst, dst, 4*NBYTES
                                                   >> 373         beqz    len, .Ldone\@
                                                   >> 374         .set    noreorder
                                                   >> 375 .Lless_than_4units\@:
                                                   >> 376         /*
                                                   >> 377          * rem = len % NBYTES
                                                   >> 378          */
                                                   >> 379         beq     rem, len, .Lcopy_bytes\@
                                                   >> 380          nop
                                                   >> 381 1:
                                                   >> 382         R10KCBARRIER(0(ra))
                                                   >> 383         LOAD(t0, 0(src), .Ll_exc\@)
                                                   >> 384         ADD     src, src, NBYTES
                                                   >> 385         SUB     len, len, NBYTES
                                                   >> 386         STORE(t0, 0(dst), .Ls_exc_p1u\@)
                                                   >> 387         .set    reorder                         /* DADDI_WAR */
                                                   >> 388         ADD     dst, dst, NBYTES
                                                   >> 389         bne     rem, len, 1b
                                                   >> 390         .set    noreorder
                                                   >> 391 
                                                   >> 392 #ifndef CONFIG_CPU_NO_LOAD_STORE_LR
                                                   >> 393         /*
                                                   >> 394          * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
                                                   >> 395          * A loop would do only a byte at a time with possible branch
                                                   >> 396          * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
                                                   >> 397          * because can't assume read-access to dst.  Instead, use
                                                   >> 398          * STREST dst, which doesn't require read access to dst.
                                                   >> 399          *
                                                   >> 400          * This code should perform better than a simple loop on modern,
                                                   >> 401          * wide-issue mips processors because the code has fewer branches and
                                                   >> 402          * more instruction-level parallelism.
                                                   >> 403          */
                                                   >> 404 #define bits t2
                                                   >> 405         beqz    len, .Ldone\@
                                                   >> 406          ADD    t1, dst, len    # t1 is just past last byte of dst
                                                   >> 407         li      bits, 8*NBYTES
                                                   >> 408         SLL     rem, len, 3     # rem = number of bits to keep
                                                   >> 409         LOAD(t0, 0(src), .Ll_exc\@)
                                                   >> 410         SUB     bits, bits, rem # bits = number of bits to discard
                                                   >> 411         SHIFT_DISCARD t0, t0, bits
                                                   >> 412         STREST(t0, -1(t1), .Ls_exc\@)
                                                   >> 413         jr      ra
                                                   >> 414          move   len, zero
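A worked instance of the tail store above, assuming a little-endian CPU, NBYTES == 4 and len == 3 (dst is word-aligned on this path):

        # rem  = len << 3       = 24 bits to keep
        # bits = 8*NBYTES - rem =  8 bits to discard
        # SHIFT_DISCARD is SLLV here, so t0 <<= 8 moves the wanted 3 bytes to
        # the high end of the register; STREST is swl, and swl at dst+len-1
        # stores exactly those high 3 bytes to dst..dst+2 without reading dst.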
                                                   >> 415 .Ldst_unaligned\@:
                                                   >> 416         /*
                                                   >> 417          * dst is unaligned
                                                   >> 418          * t0 = src & ADDRMASK
                                                   >> 419  * t1 = dst & ADDRMASK; t1 > 0
                                                   >> 420          * len >= NBYTES
                                                   >> 421          *
                                                   >> 422          * Copy enough bytes to align dst
                                                   >> 423          * Set match = (src and dst have same alignment)
                                                   >> 424          */
                                                   >> 425 #define match rem
                                                   >> 426         LDFIRST(t3, FIRST(0)(src), .Ll_exc\@)
                                                   >> 427         ADD     t2, zero, NBYTES
                                                   >> 428         LDREST(t3, REST(0)(src), .Ll_exc_copy\@)
                                                   >> 429         SUB     t2, t2, t1      # t2 = number of bytes copied
                                                   >> 430         xor     match, t0, t1
                                                   >> 431         R10KCBARRIER(0(ra))
                                                   >> 432         STFIRST(t3, FIRST(0)(dst), .Ls_exc\@)
                                                   >> 433         beq     len, t2, .Ldone\@
                                                   >> 434          SUB    len, len, t2
                                                   >> 435         ADD     dst, dst, t2
                                                   >> 436         beqz    match, .Lboth_aligned\@
                                                   >> 437          ADD    src, src, t2
                                                   >> 438 
                                                   >> 439 .Lsrc_unaligned_dst_aligned\@:
                                                   >> 440         SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
                                                   >> 441         PREFS(  0, 3*32(src) )
                                                   >> 442         beqz    t0, .Lcleanup_src_unaligned\@
                                                   >> 443          and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
                                                   >> 444         PREFD(  1, 3*32(dst) )
                                                   >> 445 1:
                                                   >> 446 /*
                                                   >> 447  * Avoid consecutive LD*'s to the same register since some mips
                                                   >> 448  * implementations can't issue them in the same cycle.
                                                   >> 449  * It's OK to load FIRST(N+1) before REST(N) because the two addresses
                                                   >> 450  * are to the same unit (unless src is aligned, but it's not).
                                                   >> 451  */
                                                   >> 452         R10KCBARRIER(0(ra))
                                                   >> 453         LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
                                                   >> 454         LDFIRST(t1, FIRST(1)(src), .Ll_exc_copy\@)
                                                   >> 455         SUB     len, len, 4*NBYTES
                                                   >> 456         LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
                                                   >> 457         LDREST(t1, REST(1)(src), .Ll_exc_copy\@)
                                                   >> 458         LDFIRST(t2, FIRST(2)(src), .Ll_exc_copy\@)
                                                   >> 459         LDFIRST(t3, FIRST(3)(src), .Ll_exc_copy\@)
                                                   >> 460         LDREST(t2, REST(2)(src), .Ll_exc_copy\@)
                                                   >> 461         LDREST(t3, REST(3)(src), .Ll_exc_copy\@)
                                                   >> 462         PREFS(  0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
                                                   >> 463         ADD     src, src, 4*NBYTES
                                                   >> 464 #ifdef CONFIG_CPU_SB1
                                                   >> 465         nop                             # improves slotting
                                                   >> 466 #endif
                                                   >> 467         STORE(t0, UNIT(0)(dst), .Ls_exc_p4u\@)
                                                   >> 468         STORE(t1, UNIT(1)(dst), .Ls_exc_p3u\@)
                                                   >> 469         STORE(t2, UNIT(2)(dst), .Ls_exc_p2u\@)
                                                   >> 470         STORE(t3, UNIT(3)(dst), .Ls_exc_p1u\@)
                                                   >> 471         PREFD(  1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
                                                   >> 472         .set    reorder                         /* DADDI_WAR */
                                                   >> 473         ADD     dst, dst, 4*NBYTES
                                                   >> 474         bne     len, rem, 1b
                                                   >> 475         .set    noreorder
                                                   >> 476 
                                                   >> 477 .Lcleanup_src_unaligned\@:
                                                   >> 478         beqz    len, .Ldone\@
                                                   >> 479          and    rem, len, NBYTES-1  # rem = len % NBYTES
                                                   >> 480         beq     rem, len, .Lcopy_bytes\@
                                                   >> 481          nop
                                                   >> 482 1:
                                                   >> 483         R10KCBARRIER(0(ra))
                                                   >> 484         LDFIRST(t0, FIRST(0)(src), .Ll_exc\@)
                                                   >> 485         LDREST(t0, REST(0)(src), .Ll_exc_copy\@)
                                                   >> 486         ADD     src, src, NBYTES
                                                   >> 487         SUB     len, len, NBYTES
                                                   >> 488         STORE(t0, 0(dst), .Ls_exc_p1u\@)
                                                   >> 489         .set    reorder                         /* DADDI_WAR */
                                                   >> 490         ADD     dst, dst, NBYTES
                                                   >> 491         bne     len, rem, 1b
                                                   >> 492         .set    noreorder
                                                   >> 493 
                                                   >> 494 #endif /* !CONFIG_CPU_NO_LOAD_STORE_LR */
                                                   >> 495 .Lcopy_bytes_checklen\@:
                                                   >> 496         beqz    len, .Ldone\@
                                                   >> 497          nop
                                                   >> 498 .Lcopy_bytes\@:
                                                   >> 499         /* 0 < len < NBYTES  */
                                                   >> 500         R10KCBARRIER(0(ra))
                                                   >> 501 #define COPY_BYTE(N)                    \
                                                   >> 502         LOADB(t0, N(src), .Ll_exc\@);   \
                                                   >> 503         SUB     len, len, 1;            \
                                                   >> 504         beqz    len, .Ldone\@;          \
                                                   >> 505         STOREB(t0, N(dst), .Ls_exc_p1\@)
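Note that STOREB sits in the beqz delay slot, so the byte is written whether or not the branch to .Ldone\@ is taken. One COPY_BYTE(0) step is, in effect (exception-table entries omitted):

        lb      t0, 0(src)              # LOADB: EXC-wrapped, may fault to .Ll_exc
        SUB     len, len, 1
        beqz    len, .Ldone
         sb     t0, 0(dst)              # STOREB: executes in the branch delay slot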
                                                   >> 506 
                                                   >> 507         COPY_BYTE(0)
                                                   >> 508         COPY_BYTE(1)
                                                   >> 509 #ifdef USE_DOUBLE
                                                   >> 510         COPY_BYTE(2)
                                                   >> 511         COPY_BYTE(3)
                                                   >> 512         COPY_BYTE(4)
                                                   >> 513         COPY_BYTE(5)
                                                   >> 514 #endif
                                                   >> 515         LOADB(t0, NBYTES-2(src), .Ll_exc\@)
                                                   >> 516         SUB     len, len, 1
                                                   >> 517         jr      ra
                                                   >> 518         STOREB(t0, NBYTES-2(dst), .Ls_exc_p1\@)
                                                   >> 519 .Ldone\@:
                                                   >> 520         jr      ra
                                                   >> 521          nop
                                                   >> 522 
                                                   >> 523 #ifdef CONFIG_CPU_NO_LOAD_STORE_LR
                                                   >> 524 .Lcopy_unaligned_bytes\@:
                                                   >> 525 1:
                                                   >> 526         COPY_BYTE(0)
                                                   >> 527         COPY_BYTE(1)
                                                   >> 528         COPY_BYTE(2)
                                                   >> 529         COPY_BYTE(3)
                                                   >> 530         COPY_BYTE(4)
                                                   >> 531         COPY_BYTE(5)
                                                   >> 532         COPY_BYTE(6)
                                                   >> 533         COPY_BYTE(7)
                                                   >> 534         ADD     src, src, 8
                                                   >> 535         b       1b
                                                   >> 536          ADD    dst, dst, 8
                                                   >> 537 #endif /* CONFIG_CPU_NO_LOAD_STORE_LR */
                                                   >> 538         .if __memcpy == 1
                                                   >> 539         END(memcpy)
                                                   >> 540         .set __memcpy, 0
                                                   >> 541         .hidden __memcpy
                                                   >> 542         .endif

.Ll_exc_copy\@:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 * (a C model of this recovery follows the macro below)
	 */
	LOADK	t0, TI_TASK($28)
	 nop
	LOADK	t0, THREAD_BUADDR(t0)
1:
	LOADB(t1, 0(src), .Ll_exc\@)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 1
	bne	src, t0, 1b
	.set	noreorder
.Ll_exc\@:
	LOADK	t0, TI_TASK($28)
	 nop
	LOADK	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len number of uncopied bytes
	jr	ra
	 nop

#define SEXC(n)							\
	.set	reorder;			/* DADDI_WAR */	\
.Ls_exc_p ## n ## u\@:						\
	ADD	len, len, n*NBYTES;				\
	jr	ra;						\
	.set	noreorder

SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

.Ls_exc_p1\@:
	.set	reorder				/* DADDI_WAR */
	ADD	len, len, 1
	jr	ra
	.set	noreorder
.Ls_exc\@:
	jr	ra
	 nop
	.endm

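The recovery logic in the .Ll_exc* and .Ls_exc* paths above reduces to two pieces of arithmetic. A minimal C model, with hypothetical helper names (end_src stands in for the address saved in AT, bad_addr for THREAD_BUADDR):

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Hypothetical C model of the fixup paths above -- not kernel code.
	 * end_src  plays the role of AT (one past the end of the source),
	 * bad_addr plays the role of THREAD_BUADDR (first faulting address).
	 * Both return an upper bound on the bytes NOT copied, matching the
	 * __copy_user convention (zero means full success).
	 */
	static size_t fixup_load_fault(uintptr_t end_src, uintptr_t bad_addr)
	{
		/* .Ll_exc: len = AT - THREAD_BUADDR */
		return (size_t)(end_src - bad_addr);
	}

	static size_t fixup_store_fault(size_t len_remaining, unsigned int n,
					size_t nbytes)
	{
		/* .Ls_exc_p<n>u: the n words of this block were not stored */
		return len_remaining + n * nbytes;
	}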
#ifndef CONFIG_HAVE_PLAT_MEMCPY
	.align	5
LEAF(memmove)
EXPORT_SYMBOL(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, .L__memcpy
	 move	v0, a0				/* return value */
	beqz	a2, .Lr_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	 sltu	t0, a1, a0
	beqz	t0, .Lr_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

.Lr_end_bytes:
	R10KCBARRIER(0(ra))
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	SUB	a0, a0, 0x1
	bnez	a2, .Lr_end_bytes
	.set	noreorder

.Lr_out:
	jr	ra
	 move	a2, zero

.Lr_end_bytes_up:
	R10KCBARRIER(0(ra))
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	ADD	a0, a0, 0x1
	bnez	a2, .Lr_end_bytes_up
	.set	noreorder

	jr	ra
	 move	a2, zero
	END(__rmemcpy)

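The dispatch at the top of memmove above is easiest to read in C. A minimal sketch (illustration only, hypothetical function name; memcpy here is the libc routine, standing in for the .L__memcpy fall-through):

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/*
	 * Illustrative C version of the dispatch memmove performs above:
	 * if the regions cannot destructively overlap, a forward memcpy
	 * is safe; otherwise copy byte by byte, backwards when dst sits
	 * inside [src, src+len) -- which is __rmemcpy's job.
	 */
	static void *my_memmove(void *dst, const void *src, size_t len)
	{
		uintptr_t d = (uintptr_t)dst, s = (uintptr_t)src;

		/* t0 = src < dst+len;  t1 = dst < src+len;  and t0, t1 */
		if (!(s < d + len && d < s + len))
			return memcpy(dst, src, len);	/* no destructive overlap */

		if (d < s) {			/* src >= dst: forward is safe */
			unsigned char *dp = dst;
			const unsigned char *sp = src;
			while (len--)
				*dp++ = *sp++;
		} else {			/* copy from the end backwards */
			unsigned char *dp = (unsigned char *)dst + len;
			const unsigned char *sp = (const unsigned char *)src + len;
			while (len--)
				*--dp = *--sp;
		}
		return dst;
	}

Copying backwards in the second branch is exactly why __rmemcpy advances both pointers to the end of the buffers before its byte loop: each destination byte is consumed from the source before it can be overwritten.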
/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 * (see the C usage sketch after this block)
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
EXPORT_SYMBOL(memcpy)
	move	v0, dst				/* return value */
.L__memcpy:
#ifndef CONFIG_EVA
FEXPORT(__raw_copy_from_user)
EXPORT_SYMBOL(__raw_copy_from_user)
FEXPORT(__raw_copy_to_user)
EXPORT_SYMBOL(__raw_copy_to_user)
#endif
	/* Legacy Mode, user <-> user */
	__BUILD_COPY_USER LEGACY_MODE USEROP USEROP

#endif

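The "len left uncopied" convention documented above is what generic callers rely on. A short caller-side sketch (generic kernel code, not from this file; copy_from_user is the usual wrapper that eventually reaches __raw_copy_from_user here):

	#include <linux/errno.h>
	#include <linux/uaccess.h>

	/*
	 * Caller-side view of the convention implemented by
	 * __BUILD_COPY_USER: the copy routine returns the number of
	 * bytes it could NOT transfer, so zero means complete success.
	 */
	static int fetch_from_user(void *kbuf, const void __user *ubuf, size_t n)
	{
		if (copy_from_user(kbuf, ubuf, n))	/* nonzero => partial copy */
			return -EFAULT;
		return 0;
	}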
#ifdef CONFIG_EVA

/*
 * For EVA we need distinct symbols for reading and writing to user space.
 * This is because we need to use specific EVA instructions to perform the
 * virtual <-> physical translation when a virtual address is actually in
 * user space.
 */

/*
 * __copy_from_user (EVA)
 */

LEAF(__raw_copy_from_user)
EXPORT_SYMBOL(__raw_copy_from_user)
	__BUILD_COPY_USER EVA_MODE USEROP KERNELOP
END(__raw_copy_from_user)

/*
 * __copy_to_user (EVA)
 */

LEAF(__raw_copy_to_user)
EXPORT_SYMBOL(__raw_copy_to_user)
__BUILD_COPY_USER EVA_MODE KERNELOP USEROP
END(__raw_copy_to_user)

#endif

/arch/hexagon/lib/memcpy.S (continued):

 */

#define ptr_out		R0	/*  destination pointer  */
#define ptr_in		R1	/*  source pointer  */
#define len		R2	/*  length of copy in bytes  */

#define data70		R13:12	/*  lo 8 bytes of non-aligned transfer  */
#define dataF8		R11:10	/*  hi 8 bytes of non-aligned transfer  */
#define ldata0		R7:6	/*  even 8 byte chunks  */
#define ldata1		R25:24	/*  odd 8 byte chunks  */
#define data1		R7	/*  lower 8 bytes of ldata1  */
#define data0		R6	/*  lower 8 bytes of ldata0  */

#define ifbyte		p0	/*  if transfer has bytes in epilog/prolog  */
#define ifhword		p0	/*  if transfer has halfwords in epilog/prolog  */
#define ifword		p0	/*  if transfer has words in epilog/prolog  */
#define noprolog	p0	/*  no prolog, xfer starts at a 32-byte boundary  */
#define nokernel	p1	/*  no 32-byte multiple block in the transfer  */
#define noepilog	p0	/*  no epilog, xfer ends on a 32-byte boundary  */
#define align		p2	/*  alignment of input rel to 8-byte boundary  */
#define kernel1		p0	/*  kernel count == 1  */

#define dalign		R25	/*  rel alignment of input to output data  */
#define star3		R16	/*  number of bytes in prolog beyond dwords  */
#define rest		R8	/*  length - prolog bytes  */
#define back		R7	/*  nr bytes > dword boundary in src block  */
#define epilog		R3	/*  bytes in epilog  */
#define inc		R15:14	/*  inc kernel by -1 and dcfetch ptr by 32  */
#define kernel		R4	/*  number of 32-byte chunks in kernel  */
#define ptr_in_p_128	R5	/*  pointer for prefetch of input data  */
#define mask		R8	/*  mask used to determine prolog size  */
#define shift		R8	/*  used to work a shifter to extract bytes  */
#define shift2		R5	/*  in epilog, to work shifter to extract bytes  */
#define prolog		R15	/*  bytes in prolog  */
#define epilogdws	R15	/*  number of dwords in epilog  */
#define shiftb		R14	/*  used to extract bytes  */
#define offset		R9	/*  same as align, in a register  */
#define ptr_out_p_32	R17	/*  pointer to output dczero  */
#define align888	R14	/*  if simple dword loop can be used  */
#define len8		R9	/*  number of dwords in length  */
#define over		R20	/*  nr of bytes > last input buf dword boundary  */

#define ptr_in_p_128kernel	R5:4	/*  packed fetch pointer & kernel count  */
	.section .text
	.p2align 4
	.global memcpy
	.type memcpy, @function
memcpy:
{
	p2 = cmp.eq(len, #0);		/*  =0  */
	align888 = or(ptr_in, ptr_out);	/*  %8 < 97  */
	p0 = cmp.gtu(len, #23);		/*  %1, <24  */
	p1 = cmp.eq(ptr_in, ptr_out);	/*  attempt to overwrite self  */
}
{
	p1 = or(p2, p1);
	p3 = cmp.gtu(len, #95);		/*  %8 < 97  */
	align888 = or(align888, len);	/*  %8 < 97  */
	len8 = lsr(len, #3);		/*  %8 < 97  */
}
{
	dcfetch(ptr_in);		/*  zero/ptr_in=ptr_out causes fetch  */
	p2 = bitsclr(align888, #7);	/*  %8 < 97  */
	if(p1) jumpr r31;		/*  =0  */
}
{
	p2 = and(p2, !p3);
	if (p2.new) len = add(len, #-8);
	if (p2.new) jump:NT .Ldwordaligned;
}
{
	if(!p0) jump .Lbytes23orless;	/*  %1, <24  */
	mask.l = #LO(0x7fffffff);
	/*  all bytes before line multiples of data  */
	prolog = sub(#0, ptr_out);
}
{
	/*  save r31 on stack, decrement sp by 32  */
	allocframe(#24);
	mask.h = #HI(0x7fffffff);
	ptr_in_p_128 = add(ptr_in, #32);
	back = cl0(len);
}
{
	memd(sp+#0) = R17:16;		/*  save r16, r17 on stack  */
	r31.l = #LO(.Lmemcpy_return);	/*  set up final return pointer  */
	prolog &= lsr(mask, back);
	offset = and(ptr_in, #7);
}
{
	memd(sp+#8) = R25:24;		/*  save r24, r25 on stack  */
	dalign = sub(ptr_out, ptr_in);
	r31.h = #HI(.Lmemcpy_return);	/*  set up final return pointer  */
}
{
	/*  see if the input buffer end is aligned  */
	over = add(len, ptr_in);
	back = add(len, offset);
	memd(sp+#16) = R21:20;		/*  save r20, r21 on stack  */
}
{
	noprolog = bitsclr(prolog, #7);
	prolog = and(prolog, #31);
	dcfetch(ptr_in_p_128);
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	kernel = sub(len, prolog);
	shift = asl(prolog, #3);
	star3 = and(prolog, #7);
	ptr_in = and(ptr_in, #-8);
}
{
	prolog = lsr(prolog, #3);
	epilog = and(kernel, #31);
	ptr_out_p_32 = add(ptr_out, prolog);
	over = and(over, #7);
}
{
	p3 = cmp.gtu(back, #8);
	kernel = lsr(kernel, #5);
	dcfetch(ptr_in_p_128);
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	p1 = cmp.eq(prolog, #0);
	if(!p1.new) prolog = add(prolog, #1);
	dcfetch(ptr_in_p_128);	/*  prefetch the line 64 bytes ahead  */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
}
{
	nokernel = cmp.eq(kernel, #0);
	dcfetch(ptr_in_p_128);	/*  prefetch the line 64 bytes ahead  */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
	shiftb = and(shift, #8);
}
{
	dcfetch(ptr_in_p_128);		/*  prefetch the line 64 bytes ahead  */
	ptr_in_p_128 = add(ptr_in_p_128, #32);
	if(nokernel) jump .Lskip64;
	p2 = cmp.eq(kernel, #1);	/*  skip the advance if kernel == 1  */
}
{
	dczeroa(ptr_out_p_32);
	/*  don't advance pointer  */
	if(!p2) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dalign = and(dalign, #31);
	dczeroa(ptr_out_p_32);
}
.Lskip64:
{
	data70 = memd(ptr_in++#16);
	if(p3) dataF8 = memd(ptr_in+#8);
	if(noprolog) jump .Lnoprolog32;
	align = offset;
}
/*  up to initial 7 bytes  */
{
	ldata0 = valignb(dataF8, data70, align);
	ifbyte = tstbit(shift, #3);
	offset = add(offset, star3);
}
{
	if(ifbyte) memb(ptr_out++#1) = data0;
	ldata0 = lsr(ldata0, shiftb);
	shiftb = and(shift, #16);
	ifhword = tstbit(shift, #4);
}
{
	if(ifhword) memh(ptr_out++#2) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifword = tstbit(shift, #5);
	p2 = cmp.gtu(offset, #7);
}
{
	if(ifword) memw(ptr_out++#4) = data0;
	if(p2) data70 = dataF8;
	if(p2) dataF8 = memd(ptr_in++#8);
	align = offset;
}
.Lnoprolog32:
{
	p3 = sp1loop0(.Ldword_loop_prolog, prolog);
	rest = sub(len, star3);	/*  what's left after the loop  */
	p0 = cmp.gt(over, #0);
}
	if(p0) rest = add(rest, #16);
.Ldword_loop_prolog:
{
	if(p3) memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	p0 = cmp.gt(rest, #16);
}
{
	data70 = dataF8;
	if(p0) dataF8 = memd(ptr_in++#8);
	rest = add(rest, #-8);
}:endloop0
.Lkernel:
{
	/*  kernel is at least 32bytes  */
	p3 = cmp.gtu(kernel, #0);
	/*  last iteration: remove edge effects  */
	if(p3.new) kernel = add(kernel, #-1);
	/*  dealt with in last dword loop  */
	if(p3.new) epilog = add(epilog, #32);
}
{
	nokernel = cmp.eq(kernel, #0);
	if(nokernel.new) jump:NT .Lepilog;
	inc = combine(#32, #-1);
	p3 = cmp.gtu(dalign, #24);
}
{
	if(p3) jump .Lodd_alignment;
}
{
	loop0(.Loword_loop_25to31, kernel);
	kernel1 = cmp.gtu(kernel, #1);
	rest = kernel;
}
	.falign
.Loword_loop_25to31:
{
	dcfetch(ptr_in_p_128);	/*  prefetch 4 lines ahead  */
	if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dczeroa(ptr_out_p_32);	/*  reserve the next 32 bytes in cache  */
	p3 = cmp.eq(kernel, rest);
}
{
	/*  kernel -= 1  */
	ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
	/*  kill write on first iteration  */
	if(!p3) memd(ptr_out++#8) = ldata1;
	ldata1 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata1;
	ldata1 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
	kernel1 = cmp.gtu(kernel, #1);
}:endloop0
{
	memd(ptr_out++#8) = ldata1;
	jump .Lepilog;
}
.Lodd_alignment:
{
	loop0(.Loword_loop_00to24, kernel);
	kernel1 = cmp.gtu(kernel, #1);
	rest = add(kernel, #-1);
}
	.falign
.Loword_loop_00to24:
{
	dcfetch(ptr_in_p_128);	/*  prefetch 4 lines ahead  */
	ptr_in_p_128kernel = vaddw(ptr_in_p_128kernel, inc);
	if(kernel1) ptr_out_p_32 = add(ptr_out_p_32, #32);
}
{
	dczeroa(ptr_out_p_32);	/*  reserve the next 32 bytes in cache  */
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	data70 = memd(ptr_in++#8);
}
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(data70, dataF8, align);
	dataF8 = memd(ptr_in++#8);
	kernel1 = cmp.gtu(kernel, #1);
}:endloop0
.Lepilog:
{
	noepilog = cmp.eq(epilog, #0);
	epilogdws = lsr(epilog, #3);
	kernel = and(epilog, #7);
}
{
	if(noepilog) jumpr r31;
	if(noepilog) ptr_out = sub(ptr_out, len);	/*  undo ptr increment  */
	p3 = cmp.eq(epilogdws, #0);
	shift2 = asl(epilog, #3);
}
{
	shiftb = and(shift2, #32);
	ifword = tstbit(epilog, #2);
	if(p3) jump .Lepilog60;
	if(!p3) epilog = add(epilog, #-16);
}
{
	loop0(.Ldword_loop_epilog, epilogdws);
	/*  stop criteria is lsbs unless = 0, then it's 8  */
	p3 = cmp.eq(kernel, #0);
	if(p3.new) kernel = #8;
	p1 = cmp.gt(over, #0);
}
	/*  if not aligned to end of buffer execute 1 more iteration  */
	if(p1) kernel = #0;
.Ldword_loop_epilog:
{
	memd(ptr_out++#8) = ldata0;
	ldata0 = valignb(dataF8, data70, align);
	p3 = cmp.gt(epilog, kernel);
}
{
	data70 = dataF8;
	if(p3) dataF8 = memd(ptr_in++#8);
	epilog = add(epilog, #-8);
}:endloop0
/*  copy last 7 bytes  */
.Lepilog60:
{
	if(ifword) memw(ptr_out++#4) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifhword = tstbit(epilog, #1);
	shiftb = and(shift2, #16);
}
{
	if(ifhword) memh(ptr_out++#2) = data0;
	ldata0 = lsr(ldata0, shiftb);
	ifbyte = tstbit(epilog, #0);
	if(ifbyte.new) len = add(len, #-1);
}
{
	if(ifbyte) memb(ptr_out) = data0;
	ptr_out = sub(ptr_out, len);	/*  return original dest pointer  */
	jumpr r31;
}
/*  do byte copy for small n  */
.Lbytes23orless:
{
	p3 = sp1loop0(.Lbyte_copy, len);
	len = add(len, #-1);
}
.Lbyte_copy:
{
	data0 = memb(ptr_in++#1);
	if(p3) memb(ptr_out++#1) = data0;
}:endloop0
{
	memb(ptr_out) = data0;
	ptr_out = sub(ptr_out, len);
	jumpr r31;
}
/*  do dword copies for aligned in, out and length  */
.Ldwordaligned:
{
	p3 = sp1loop0(.Ldword_copy, len8);
}
.Ldword_copy:
{
	if(p3) memd(ptr_out++#8) = ldata0;
	ldata0 = memd(ptr_in++#8);
}:endloop0
{
	memd(ptr_out) = ldata0;
	ptr_out = sub(ptr_out, len);
	jumpr r31;	/*  return to function caller  */
}
.Lmemcpy_return:
	r21:20 = memd(sp+#16);	/*  restore r20+r21  */
{
	r25:24 = memd(sp+#8);	/*  restore r24+r25  */
	r17:16 = memd(sp+#0);	/*  restore r16+r17  */
}
	deallocframe;	/*  restore r31 and pop the frame  */
	jumpr r31
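The Hexagon routine splits every large copy into a prolog (bytes up to the next 32-byte output line), a kernel (whole 32-byte lines, with dczeroa pre-allocating each destination line), and an epilog (the remainder); misaligned sources are handled by streaming two 8-byte windows and extracting an aligned double-word, which is what valignb(dataF8, data70, align) does in the loops above. A rough C model of that arithmetic (hypothetical names, simplified; the real prolog cap uses the lsr(mask, cl0(len)) trick rather than an explicit compare):

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Hypothetical model of the prolog/kernel/epilog split used above,
	 * not the kernel's implementation -- just the same arithmetic.
	 */
	static void split_copy(uintptr_t dst, size_t len,
			       size_t *prolog, size_t *kernel, size_t *epilog)
	{
		size_t p = (0 - dst) & 31;	/* sub(#0, ptr_out), masked to 31 */

		if (p > len)			/* lsr(mask, cl0(len)) approximates */
			p = len;		/* this cap on the prolog */
		*prolog = p;
		*kernel = (len - p) >> 5;	/* lsr(kernel, #5): 32-byte lines */
		*epilog = (len - p) & 31;
	}

	/*
	 * Model of valignb(hi, lo, align): view hi:lo as 16 consecutive
	 * source bytes and extract the 8 bytes starting at byte offset
	 * 'align' (0..7), little-endian.
	 */
	static uint64_t valignb_model(uint64_t hi, uint64_t lo, unsigned align)
	{
		if (align == 0)
			return lo;
		return (lo >> (8 * align)) | (hi << (8 * (8 - align)));
	}

Each kernel-loop packet stores one extracted double-word while loading the next source double-word, so the two windows rotate (data70 and dataF8 swap roles on alternate packets) and the misalignment cost is absorbed entirely by the extract.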
                                                      
