~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/alpha/lib/ev6-memcpy.S

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /* SPDX-License-Identifier: GPL-2.0 */
  2 /*
  3  * arch/alpha/lib/ev6-memcpy.S
  4  * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
  5  *
  6  * Reasonably optimized memcpy() routine for the Alpha 21264
  7  *
  8  *      - memory accessed as aligned quadwords only
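  9  *      - NOTE: the remark "uses bcmpge [sic: cmpbge] to compare 8 bytes in parallel" does not match the code below, which contains no byte-compare instruction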
 10  *
 11  * Much of the information about 21264 scheduling/coding comes from:
 12  *      Compiler Writer's Guide for the Alpha 21264
 13  *      abbreviated as 'CWG' in other comments here
 14  *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 15  * Scheduling notation:
 16  *      E       - either cluster
 17  *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 18  *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 19  *
 20  * Temp usage notes:
 21  *      $1-$7           - scratch ($4 doubles as the dest pointer and $16 as a data temp on the misaligned path)
 22  */
 23 #include <linux/export.h>
 24         .set noreorder
 25         .set noat
 26 
 27         .align  4
 28         .globl memcpy
 29         .ent memcpy
            /*
             * memcpy: copy $18 bytes from the address in $17 to the address
             * in $16, and return the original destination pointer in $0.
             *
             * Register roles, as used by the code below:
             *      $0      - return value: dest as passed in (written once, at entry)
             *      $16     - dest pointer on the same-alignment path;
             *                reused as a data temp inside $mis_quad
             *      $17     - source pointer
             *      $18     - bytes remaining (always tested as a signed value)
             *      $1      - scratch: alignment tests, loop conditions, byte/quad data
             *      $2      - scratch: "two more trips?" test in the unrolled loop
             *      $3-$6   - quadword data temps
             *      $4      - dest pointer on the $misaligned path
             *      $7      - address used by the wh64 hint in the unrolled loop
             *      $26     - return address (used by both ret instructions)
             *
             * Paths:
             *   1. src and dest equal mod 8: byte copies until both are 0mod8;
             *      then, if more than 127 bytes remain, single quads until dest
             *      is 0mod64 followed by the 64-bytes-per-trip unrolled loop;
             *      then whole quads; then trailing bytes.
             *   2. src and dest differ mod 8 ($misaligned): byte copies until
             *      dest is 0mod8; then ldq_u/extql/extqh merging so every store
             *      is an aligned quad; then trailing bytes.
             *
             * A count <= 0 returns immediately without touching memory.
             *
             * NOTE(review): instructions are laid out in groups of four with
             * nops as filler; this appears to follow the 21264 scheduling
             * rules from the CWG referenced at the top of the file, so keep
             * the grouping intact when editing.
             */
 30 memcpy:
 31         .frame $30,0,$26,0
 32         .prologue 0
 33
 34         mov     $16, $0                 # E : copy dest to return
 35         ble     $18, $nomoredata        # U : count <= 0 (signed): nothing to copy
 36         xor     $16, $17, $1            # E : are source and dest alignments the same?
 37         and     $1, 7, $1               # E : nonzero if they differ mod 8
 38
 39         bne     $1, $misaligned         # U : Nope - gotta do this the slow way
 40         /* source and dest are same mod 8 address */
 41         and     $16, 7, $1              # E : Are both 0mod8?
 42         beq     $1, $both_0mod8         # U : Yes
 43         nop                             # E :
 44
 45         /*
 46          * source and dest are same misalignment.  move a byte at a time
 47          * until a 0mod8 alignment for both is reached.
 48          * At least one byte more to move
 49          */
 50
 51 $head_align:
 52         ldbu    $1, 0($17)              # L : grab a byte
 53         subq    $18, 1, $18             # E : count--
 54         addq    $17, 1, $17             # E : src++
 55         stb     $1, 0($16)              # L : store it at dest
 56         addq    $16, 1, $16             # E : dest++
 57         and     $16, 7, $1              # E : Are we at 0mod8 yet?
 58         ble     $18, $nomoredata        # U : count exhausted while aligning: done
 59         bne     $1, $head_align         # U : not yet 0mod8, copy another byte
 60
            /* Here src and dest are both 0mod8 and count > 0. */
 61 $both_0mod8:
 62         cmple   $18, 127, $1            # E : $1 = (count <= 127): too short to unroll
 63         bne     $1, $no_unroll          # U : short copy, go straight to the quad loop
 64         and     $16, 63, $1             # E : get mod64 alignment
 65         beq     $1, $do_unroll          # U : no single quads to fiddle
 66
            /*
             * Copy single quads until dest is 0mod64.  This is only entered
             * with count > 127 and dest 0mod8, so at most 7 quads (56 bytes)
             * are moved and count cannot run out inside this loop.
             */
 67 $single_head_quad:
 68         ldq     $1, 0($17)              # L : get 8 bytes
 69         subq    $18, 8, $18             # E : count -= 8
 70         addq    $17, 8, $17             # E : src += 8
 71         nop                             # E :
 72
 73         stq     $1, 0($16)              # L : store
 74         addq    $16, 8, $16             # E : dest += 8
 75         and     $16, 63, $1             # E : get mod64 alignment
 76         bne     $1, $single_head_quad   # U : still not fully aligned
 77
 78 $do_unroll:
 79         addq    $16, 64, $7             # E : Initial (+1 trip) wh64 address
 80         cmple   $18, 127, $1            # E : Can we go through the unrolled loop?
 81         bne     $1, $tail_quads         # U : Nope
 82         nop                             # E :
 83
            /*
             * Unrolled loop: each trip copies 64 bytes as two 32-byte halves
             * (four loads, then four stores, twice) and issues one wh64.
             *
             * $7 is the wh64 address.  It is first entered as dest + 64 and
             * advances by 64 per trip.  When fewer than 192 bytes remain at the
             * top of a trip ($2 < 0), cmovlt replaces the advanced value with
             * $1 = (dest at the top of this trip) + 64, which is the block the
             * next trip itself writes rather than the block beyond it --
             * presumably so the hint is never issued for 64 bytes that this
             * loop will not completely overwrite.
             *
             * The loop repeats while count > 63 after the decrement.
             */
 84 $unroll_body:
 85         wh64    ($7)                    # L1 : memory subsystem hint: 64 bytes at
 86                                         # ($7) are about to be over-written
 87         ldq     $6, 0($17)              # L0 : bytes 0..7
 88         nop                             # E :
 89         nop                             # E :
 90
 91         ldq     $4, 8($17)              # L : bytes 8..15
 92         ldq     $5, 16($17)             # L : bytes 16..23
 93         addq    $7, 64, $7              # E : Update next wh64 address
 94         nop                             # E :
 95
 96         ldq     $3, 24($17)             # L : bytes 24..31
 97         addq    $16, 64, $1             # E : fallback value for wh64 (dest not yet advanced)
 98         nop                             # E :
 99         nop                             # E :
100
101         addq    $17, 32, $17            # E : src += 32 bytes
102         stq     $6, 0($16)              # L : bytes 0..7
103         nop                             # E :
104         nop                             # E :
105
106         stq     $4, 8($16)              # L : bytes 8..15
107         stq     $5, 16($16)             # L : bytes 16..23
108         subq    $18, 192, $2            # E : At least two more trips to go?
109         nop                             # E :
110
111         stq     $3, 24($16)             # L : bytes 24..31
112         addq    $16, 32, $16            # E : dest += 32 bytes
113         nop                             # E :
114         nop                             # E :
115
116         ldq     $6, 0($17)              # L : bytes 32..39 of this trip (src already += 32)
117         ldq     $4, 8($17)              # L : bytes 40..47 of this trip
118         cmovlt  $2, $1, $7              # E : Latency 2, extra map slot - Use
119                                         # fallback wh64 address if < 2 more trips
120         nop                             # E :
121
122         ldq     $5, 16($17)             # L : bytes 48..55 of this trip
123         ldq     $3, 24($17)             # L : bytes 56..63 of this trip
124         addq    $16, 32, $16            # E : dest += 32 (stores below use negative offsets)
125         subq    $18, 64, $18            # E : count -= 64
126
127         addq    $17, 32, $17            # E : src += 32
128         stq     $6, -32($16)            # L : bytes 32..39 of this trip
129         stq     $4, -24($16)            # L : bytes 40..47 of this trip
130         cmple   $18, 63, $1             # E : $1 = (count <= 63): no full trip left
131
132         stq     $5, -16($16)            # L : bytes 48..55 of this trip
133         stq     $3, -8($16)             # L : bytes 56..63 of this trip
134         nop                             # E :
135         beq     $1, $unroll_body        # U : count > 63, go around again
136
            /*
             * Whole-quad copy; both labels name this same code.  count is
             * biased by -8 here and stays biased through $move_a_quad, so
             * "count >= 0" means at least one more full quad remains.
             */
137 $tail_quads:
138 $no_unroll:
139         .align 4
140         subq    $18, 8, $18             # E : At least a quad left?
141         blt     $18, $less_than_8       # U : Nope
142         nop                             # E :
143         nop                             # E :
144
145 $move_a_quad:
146         ldq     $1, 0($17)              # L : fetch 8
147         subq    $18, 8, $18             # E : count -= 8
148         addq    $17, 8, $17             # E : src += 8
149         nop                             # E :
150
151         stq     $1, 0($16)              # L : store 8
152         addq    $16, 8, $16             # E : dest += 8
153         bge     $18, $move_a_quad       # U : another full quad remains
154         nop                             # E :
155
156 $less_than_8:
157         .align 4
158         addq    $18, 8, $18             # E : add back for trailing bytes
159         ble     $18, $nomoredata        # U : All-done
160         nop                             # E :
161         nop                             # E :
162
163         /* Trailing bytes: 1..7 bytes left, copied one at a time */
164 $tail_bytes:
165         subq    $18, 1, $18             # E : count--
166         ldbu    $1, 0($17)              # L : fetch a byte
167         addq    $17, 1, $17             # E : src++
168         nop                             # E :
169
170         stb     $1, 0($16)              # L : store a byte
171         addq    $16, 1, $16             # E : dest++
172         bgt     $18, $tail_bytes        # U : more to be done?
173         nop                             # E :
174
175         /* branching to exit takes 3 extra cycles, so replicate exit here */
176         ret     $31, ($26), 1           # L0 : return; $0 still holds the original dest
177         nop                             # E :
178         nop                             # E :
179         nop                             # E :
180
            /*
             * src and dest differ mod 8.  dest is tracked in $4 from here on;
             * $16 is later reused as a data temp by $mis_quad.
             */
181 $misaligned:
182         mov     $0, $4                  # E : dest temp ($0 holds dest as passed in)
183         and     $0, 7, $1               # E : dest alignment mod8
184         beq     $1, $dest_0mod8         # U : life doesnt totally suck (dest already 0mod8)
185         nop
186
187 $aligndest:
188         ble     $18, $nomoredata        # U : count ran out before dest got aligned
189         ldbu    $1, 0($17)              # L : fetch a byte
190         subq    $18, 1, $18             # E : count--
191         addq    $17, 1, $17             # E : src++
192
193         stb     $1, 0($4)               # L : store it
194         addq    $4, 1, $4               # E : dest++
195         and     $4, 7, $1               # E : dest 0mod8 yet?
196         bne     $1, $aligndest          # U : go until we are aligned.
197
198         /* Source has unknown alignment, but dest is known to be 0mod8 */
199 $dest_0mod8:
200         subq    $18, 8, $18             # E : At least a quad left? (count biased by -8)
201         blt     $18, $misalign_tail     # U : Nope
202         ldq_u   $3, 0($17)              # L : seed (rotating load) of 8 bytes
203         nop                             # E :
204
            /*
             * Each trip builds one destination quad from two source quads:
             * $3 holds the quad fetched on the previous trip (or the seed) and
             * $16 the one fetched now; extql/extqh select the wanted bytes of
             * each according to the low bits of $17, and bis merges them.
             */
205 $mis_quad:
206         ldq_u   $16, 8($17)             # L : Fetch next 8 ($16 is a data temp here)
207         extql   $3, $17, $3             # U : low part, from the previously fetched quad
208         extqh   $16, $17, $1            # U : high part, from the quad just fetched
209         bis     $3, $1, $1              # E : merged bytes to store
210
211         subq    $18, 8, $18             # E : count -= 8
212         addq    $17, 8, $17             # E : src += 8
213         stq     $1, 0($4)               # L : store 8 (aligned)
214         mov     $16, $3                 # E : "rotate" source data
215
216         addq    $4, 8, $4               # E : dest += 8
217         bge     $18, $mis_quad          # U : More quads to move
218         nop
219         nop
220
221 $misalign_tail:
222         addq    $18, 8, $18             # E : account for tail stuff (undo the -8 bias)
223         ble     $18, $nomoredata        # U : nothing left: done
224         nop
225         nop
226
227 $misalign_byte:
228         ldbu    $1, 0($17)              # L : fetch 1
229         subq    $18, 1, $18             # E : count--
230         addq    $17, 1, $17             # E : src++
231         nop                             # E :
232
233         stb     $1, 0($4)               # L : store
234         addq    $4, 1, $4               # E : dest++
235         bgt     $18, $misalign_byte     # U : more to go?
236         nop
237
238
            /* Common exit: reached by branch from every early-out, and by fall-through from $misalign_byte. */
239 $nomoredata:
240         ret     $31, ($26), 1           # L0 : return; $0 still holds the original dest
241         nop                             # E :
242         nop                             # E :
243         nop                             # E :
244
245         .end memcpy
246         EXPORT_SYMBOL(memcpy)
247 
248 /* For backwards module compatibility: define __memcpy as a global symbol equal to memcpy.  */
249 __memcpy = memcpy
250 .globl __memcpy

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php