~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/sh/lib/memcpy-sh4.S

Version: ~ [ linux-6.12-rc7 ] ~ [ linux-6.11.7 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.60 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.116 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.171 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.229 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.285 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.323 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.12 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /* SPDX-License-Identifier: GPL-2.0 */
  2 /*
  3  * "memcpy" implementation of SuperH
  4  *
  5  * Copyright (C) 1999  Niibe Yutaka
  6  * Copyright (c) 2002  STMicroelectronics Ltd
  7  *   Modified from memcpy.S and micro-optimised for SH4
  8  *   Stuart Menefy (stuart.menefy@st.com)
  9  *
 10  */
 11 #include <linux/linkage.h>
 12 
 13 /*
 14  * void *memcpy(void *dst, const void *src, size_t n);
 15  *
 16  * It is assumed that there is no overlap between src and dst.
 17  * If there is an overlap, then the results are undefined.
 18  */
 19 
 20         !
 21         !       GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
 22         !
 23 
 24         ! Size is 16 or greater, and may have trailing bytes
 25 
 26         .balign 32
 27 .Lcase1:
            /*
             * Bulk copy reached from the "bit 0 set" dispatch in memcpy when
             * bit 1 of (src - dst) is clear, i.e. (src - dst) & 3 == 1.
             *
             * The copy runs backwards: r0 is the destination cursor and is
             * pre-decremented by every store until it reaches r4 (dst).
             *
             * Register use in this block:
             *   r0 = dst cursor            r4 = dst (lower bound)
             *   r5 = offset added to r0 to address the source
             *   r7 = long word read on the previous iteration
             *   r1 = long word read on this iteration
             *   r2 = loop bound derived from r4
             *   r3, r6 = scratch used to assemble the long word to store
             *
             * r5 is biased by -1 for the priming load and then by a further
             * -4, so each mov.l @(r0,r5) in the loop reads the long word
             * immediately below the one read before it.
             */
 28         ! Read a long word and write a long word at once
 29         ! At the start of each iteration, r7 contains last long load
 30         add     #-1,r5          !  79 EX
 31         mov     r4,r2           !   5 MT (0 cycles latency)
 32 
 33         mov.l   @(r0,r5),r7     !  21 LS (2 cycles latency)
 34         add     #-4,r5          !  50 EX
 35 
 36         add     #7,r2           !  79 EX
 37         !
 38 #ifdef CONFIG_CPU_LITTLE_ENDIAN
            /*
             * Stored value = (r7 << 24) | (r1 >> 8).  cmp/hi sets T when
             * r0 > r2 (unsigned) and is evaluated before the delay-slot
             * store decrements r0.
             */
 39         ! 6 cycles, 4 bytes per iteration
 40 3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
 41         mov     r7, r3          !   5 MT (latency=0)    ! RQPO
 42 
 43         cmp/hi  r2,r0           !  57 MT
 44         shll16  r3              ! 103 EX
 45 
 46         mov     r1,r6           !   5 MT (latency=0)
 47         shll8   r3              ! 102 EX                ! Oxxx
 48 
 49         shlr8   r6              ! 106 EX                ! xNML
 50         mov     r1, r7          !   5 MT (latency=0)
 51 
 52         or      r6,r3           !  82 EX                ! ONML
 53         bt/s    3b              ! 109 BR
 54 
 55          mov.l  r3,@-r0         !  30 LS
 56 #else
            /*
             * Big-endian mirror of the loop above:
             * stored value = (r7 >> 24) | (r1 << 8).
             */
 57 3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! KLMN
 58         mov     r7,r3           !   5 MT (latency=0)    ! OPQR
 59 
 60         cmp/hi  r2,r0           !  57 MT
 61         shlr16  r3              ! 107 EX
 62 
 63         shlr8   r3              ! 106 EX                ! xxxO
 64         mov     r1,r6           !   5 MT (latency=0)
 65 
 66         shll8   r6              ! 102 EX                ! LMNx
 67         mov     r1,r7           !   5 MT (latency=0)
 68 
 69         or      r6,r3           !  82 EX                ! LMNO
 70         bt/s    3b              ! 109 BR
 71 
 72          mov.l  r3,@-r0         !  30 LS
 73 #endif
 74         ! Finally, copy a byte at once, if necessary
            /*
             * The long word loop exits once r0 <= r4 + 7 at the top of an
             * iteration, which leaves 0-3 bytes after that iteration's store.
             * r5 += 4 puts r5 back at (value on entry to .Lcase1) - 1, so
             * @(r0,r5) is the source byte for destination r0-1.  r2 becomes
             * r4 + 1 so that the byte loop, which tests "r0 > r2" before
             * each store, stops when r0 has come down to r4.
             */
 75 
 76         add     #4,r5           !  50 EX
 77         cmp/eq  r4,r0           !  54 MT
 78 
 79         add     #-6,r2          !  50 EX
 80         bt      9f              ! 109 BR
 81 
 82 8:      cmp/hi  r2,r0           !  57 MT
 83         mov.b   @(r0,r5),r1     !  20 LS (latency=2)
 84 
 85         bt/s    8b              ! 109 BR
 86 
 87          mov.b  r1,@-r0         !  29 LS
 88 
            /* r0 == r4 (dst) on both ways of reaching this return. */
 89 9:      rts
 90          nop
 91 
 92 
 93         !
 94         !       GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
 95         !
 96 
 97         ! Size is 16 or greater, and may have trailing bytes
 98 
 99         .balign 32
100 .Lcase3:
            /*
             * Bulk copy reached from the "bit 0 set" dispatch in memcpy when
             * bit 1 of (src - dst) is also set, i.e. (src - dst) & 3 == 3.
             *
             * Same structure as .Lcase1: r0 is the destination cursor,
             * pre-decremented by every store until it reaches r4 (dst);
             * r7 carries the previously loaded long word into the next
             * iteration; r2 is the loop bound (r4 + 7).
             *
             * Here r5 is biased by -3 for the priming load and then by a
             * further -4 for the loop loads.
             */
101         ! Read a long word and write a long word at once
102         ! At the start of each iteration, r7 contains last long load
103         add     #-3,r5          ! 79 EX
104         mov     r4,r2           !  5 MT (0 cycles latency)
105 
106         mov.l   @(r0,r5),r7     ! 21 LS (2 cycles latency)
107         add     #-4,r5          ! 50 EX
108 
109         add     #7,r2           !  79 EX
110         !
111 #ifdef CONFIG_CPU_LITTLE_ENDIAN
            /*
             * Stored value = (r7 << 8) | (r1 >> 24).  cmp/hi sets T when
             * r0 > r2 (unsigned), tested before the delay-slot store
             * decrements r0.
             */
112         ! 6 cycles, 4 bytes per iteration
113 3:      mov.l   @(r0,r5),r1     !  21 LS (latency=2)    ! NMLK
114         mov     r7, r3          !   5 MT (latency=0)    ! RQPO
115 
116         cmp/hi  r2,r0           !  57 MT
117         shll8   r3              ! 102 EX                ! QPOx
118 
119         mov     r1,r6           !   5 MT (latency=0)
120         shlr16  r6              ! 107 EX
121 
122         shlr8   r6              ! 106 EX                ! xxxN
123         mov     r1, r7          !   5 MT (latency=0)
124 
125         or      r6,r3           !  82 EX                ! QPON
126         bt/s    3b              ! 109 BR
127 
128          mov.l  r3,@-r0         !  30 LS
129 #else
            /*
             * Big-endian variant: stored value = (old r7 >> 8) | (new r7 << 24).
             * The old long word is copied to r3 first, so the new one is
             * loaded straight into r7 and no separate r1 -> r7 move is used.
             */
130 3:      mov     r7,r3           ! OPQR
131         shlr8   r3              ! xOPQ
132         mov.l   @(r0,r5),r7     ! KLMN
133         mov     r7,r6
134         shll16  r6
135         shll8   r6              ! Nxxx
136         or      r6,r3           ! NOPQ
137         cmp/hi  r2,r0
138         bt/s    3b
139          mov.l  r3,@-r0
140 #endif
141 
142         ! Finally, copy a byte at once, if necessary
            /*
             * As in .Lcase1 the loop leaves 0-3 bytes.  r5 += 6 undoes all but
             * 1 of the -3 and -4 biases applied above, so @(r0,r5) is the
             * source byte for destination r0-1; r2 becomes r4 + 1 so the byte
             * loop ("r0 > r2" tested before each store) ends with r0 == r4.
             */
143 
144         add     #6,r5           !  50 EX
145         cmp/eq  r4,r0           !  54 MT
146 
147         add     #-6,r2          !  50 EX
148         bt      9f              ! 109 BR
149 
150 8:      cmp/hi  r2,r0           !  57 MT
151         mov.b   @(r0,r5),r1     !  20 LS (latency=2)
152 
153         bt/s    8b              ! 109 BR
154 
155          mov.b  r1,@-r0         !  29 LS
156 
            /* r0 == r4 (dst) on both ways of reaching this return. */
157 9:      rts
158          nop
159 
160 ENTRY(memcpy)
            /*
             * Entry point.  Register use, as read from the code below and the
             * prototype at the top of the file:
             *   on entry: r4 = dst, r5 = src, r6 = n
             *   r0 = dst + n (one past the end of dst).  Every copy path
             *        stores with @-r0, so data is copied from the end of the
             *        buffers down towards r4.
             *   r5 = src - dst, so that r0 + r5 addresses the source data
             *        matching the destination cursor r0.
             *
             * Two early decisions are made here:
             *   n == 0                    -> 99f, which returns with r0 = r4
             *   (dst | src | n) & 3 == 0  -> .Lcase00 (everything long word
             *                                aligned)
             * Otherwise execution falls through with r1 = 16 for the
             * small-copy test that follows.
             *
             * The instruction after each bt/s is its delay slot and executes
             * whether or not the branch is taken.  "tst #3, r0" therefore
             * always runs, and the T bit it produces is the one consumed by
             * "bt/s .Lcase00": the mov/add/mov in between do not write T.
             */
161 
162         ! Calculate the invariants which will be used in the remainder
163         ! of the code:
164         !
165         !      r4   -->  [ ...  ] DST             [ ...  ] SRC
166         !                [ ...  ]                 [ ...  ]
167         !                  :                        :
168         !      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
169         !
170         !
171 
172         ! Short circuit the common case of src, dst and len being 32 bit aligned
173         ! and test for zero length move
174 
175         mov     r6, r0          !   5 MT (0 cycle latency)
176         or      r4, r0          !  82 EX
177 
178         or      r5, r0          !  82 EX
179         tst     r6, r6          !  86 MT
180 
181         bt/s    99f             ! 111 BR                (zero len)
182          tst    #3, r0          !  87 MT
183 
184         mov     r4, r0          !   5 MT (0 cycle latency)
185         add     r6, r0          !  49 EX
186 
187         mov     #16, r1         !   6 EX
188         bt/s    .Lcase00        ! 111 BR                (aligned)
189 
190          sub    r4, r5          !  75 EX
191 
192         ! Arguments are not nicely long word aligned or zero len.
193         ! Check for small copies, and if so do a simple byte at a time copy.
194         !
195         ! Deciding on an exact value of 'small' is not easy, as the point at which
196         ! using the optimised routines become worthwhile varies (these are the
197         ! cycle counts for different sizes using byte-at-a-time vs. optimised):
198         !       size    byte-at-time    long    word    byte
199         !       16      42              39-40   46-50   50-55
200         !       24      58              43-44   54-58   62-67
201         !       36      82              49-50   66-70   80-85
202         ! However the penalty for getting it 'wrong' is much higher for long word
203         ! aligned data (and this is more common), so use a value of 16.
204 
205         cmp/gt  r6,r1           !  56 MT
            /*
             * Small-copy path.  The cmp/gt above sets T when r1 (16) is
             * greater than r6 (n) as a signed compare; "bf/s 6f" leaves for
             * the optimised code when that is not the case.
             *
             * Copy one byte at a time, unrolled to two bytes per iteration:
             *   r5 = src - dst - 1, so @(r0,r5) is the source byte for r0-1
             *   r3 = r5 - 1,        so @(r0,r3) is the source byte for r0-2
             *   "shlr r6" halves the count and moves its old bit 0 into T:
             *     n even -> enter the loop at 4 with r1 already loaded
             *     n odd  -> store the odd byte first (delay slot of
             *               "bt/s 98f"); if r6 is now zero that was the
             *               only byte and the routine returns via 98
             *   "dt r6" decrements r6 and sets T when it reaches zero.
             *
             * Note that "add #-1,r5" and "mov r5, r3" also execute on the
             * way to 6f, so the not-small code starts with r5 = src - dst - 1.
             *
             * 99 is the zero-length return used by the entry code; its delay
             * slot sets r0 = r4.
             */
206 
207         add     #-1,r5          !  50 EX
208         bf/s    6f              ! 108 BR                (not small)
209 
210          mov    r5, r3          !   5 MT (latency=0)
211         shlr    r6              ! 104 EX
212 
213         mov.b   @(r0,r5),r1     !  20 LS (latency=2)
214         bf/s    4f              ! 111 BR
215 
216          add    #-1,r3          !  50 EX
217         tst     r6, r6          !  86 MT
218 
219         bt/s    98f             ! 110 BR
220          mov.b  r1,@-r0         !  29 LS
221 
222         ! 4 cycles, 2 bytes per iteration
223 3:      mov.b   @(r0,r5),r1     !  20 LS (latency=2)
224 
225 4:      mov.b   @(r0,r3),r2     !  20 LS (latency=2)
226         dt      r6              !  67 EX
227 
228         mov.b   r1,@-r0         !  29 LS
229         bf/s    3b              ! 111 BR
230 
231          mov.b  r2,@-r0         !  29 LS
232 98:
233         rts
234          nop
235 
236 99:     rts
237          mov    r4, r0
238 
239         ! Size is not small, so it's worthwhile looking for optimisations.
240         ! First align destination to a long word boundary.
241         !
242         ! r5 = normal value -1
243 
244 6:      tst     #3, r0          !  87 MT
            /*
             * Not-small path, entered with r5 = src - dst - 1.
             *
             * Step 1 (labels 6 and 1): if the destination cursor r0 is not
             * long word aligned, copy (r0 & 3) bytes one at a time so that it
             * becomes aligned.  r3 is the byte counter and r6 (bytes left) is
             * decremented once per byte.
             *
             * Step 2 (label 2): put r5 back to src - dst and pick the bulk
             * routine from its low two bits and from whether r6 >= 64
             * (cmp/ge, a signed compare against r7 = 64):
             *
             *   (src - dst) & 3     r6 < 64      r6 >= 64
             *         0             .Lcase0      .Lcase0b
             *         2             .Lcase2      .Lcase2b
             *         1             .Lcase1      .Lcase1
             *         3             .Lcase3      .Lcase3
             *
             * "tst #imm" takes r0 as its register operand, so the
             * destination cursor is parked in r3 while r5 is examined in r0.
             * It is moved back by the "mov r3, r0" in the delay slot of each
             * dispatching bt/s; a delay slot executes whether or not the
             * branch is taken, so r0 is also restored on the bra paths.
             */
245         mov     #3, r3          !   6 EX
246 
247         bt/s    2f              ! 111 BR
248          and    r0,r3           !  78 EX
249 
250         ! 3 cycles, 1 byte per iteration
251 1:      dt      r3              !  67 EX
252         mov.b   @(r0,r5),r1     !  19 LS (latency=2)
253 
254         add     #-1, r6         !  79 EX
255         bf/s    1b              ! 109 BR
256 
257          mov.b  r1,@-r0         !  28 LS
258 
259 2:      add     #1, r5          !  79 EX
260 
261         ! Now select the appropriate bulk transfer code based on relative
262         ! alignment of src and dst.
263 
264         mov     r0, r3          !   5 MT (latency=0)
265 
266         mov     r5, r0          !   5 MT (latency=0)
267         tst     #1, r0          !  87 MT
268 
269         bf/s    1f              ! 111 BR
270          mov    #64, r7         !   6 EX
271 
272         ! bit 0 clear
273 
274         cmp/ge  r7, r6          !  55 MT
275 
276         bt/s    2f              ! 111 BR
277          tst    #2, r0          !  87 MT
278 
279         ! small
280         bt/s    .Lcase0
281          mov    r3, r0
282 
283         bra     .Lcase2
284          nop
285 
286         ! big
287 2:      bt/s    .Lcase0b
288          mov    r3, r0
289 
290         bra     .Lcase2b
291          nop
292 
293         ! bit 0 set
294 1:      tst     #2, r0          ! 87 MT
295 
296         bt/s    .Lcase1
297          mov    r3, r0
298 
299         bra     .Lcase3
300          nop
301 
302 
303         !
304         !       GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
305         !
306 
307         ! src, dst and size are all long word aligned
308         ! size is non-zero
309 
310         .balign 32
311 .Lcase00:
            /*
             * Fast path taken straight from the entry code when dst, src and
             * n all have their low two bits clear and n != 0.
             * On entry r0 = dst + n, r5 = src - dst and r6 = n.
             *
             * r6 >= 64 (cmp/gt leaves T clear): continue at .Lcase00b with r5
             * already biased by -4.  "bf" without /s has no delay slot, so
             * the shlr2 below only runs on the fall-through path.
             *
             * Otherwise copy long words, two per iteration:
             *   r6 = n >> 3 (number of long word pairs)
             *   r5 = src - dst - 4 (source for r0-4)
             *   r3 = src - dst - 8 (source for r0-8)
             *   "shlr r6" leaves T set when the number of long words is odd;
             *   in that case one long word is stored first (delay slot of
             *   "bt/s 5f"), and if no pairs remain the routine returns via 5.
             *   "dt r6" decrements r6 and sets T when it reaches zero.
             */
312         mov     #64, r1         !   6 EX
313         mov     r5, r3          !   5 MT (latency=0)
314 
315         cmp/gt  r6, r1          !  56 MT
316         add     #-4, r5         !  50 EX
317 
318         bf      .Lcase00b       ! 108 BR                (big loop)
319         shlr2   r6              ! 105 EX
320 
321         shlr    r6              ! 104 EX
322         mov.l   @(r0, r5), r1   !  21 LS (latency=2)
323 
324         bf/s    4f              ! 111 BR
325          add    #-8, r3         !  50 EX
326 
327         tst     r6, r6          !  86 MT
328         bt/s    5f              ! 110 BR
329 
330          mov.l  r1,@-r0         !  30 LS
331 
332         ! 4 cycles, 2 long words per iteration
333 3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
334 
335 4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
336         dt      r6              !  67 EX
337 
338         mov.l   r1, @-r0        !  30 LS
339         bf/s    3b              ! 109 BR
340 
341          mov.l  r2, @-r0        !  30 LS
342 
343 5:      rts
344          nop
345 
346 
347         ! Size is 16 or greater and less than 64, but may have trailing bytes
348 
349         .balign 32
350 .Lcase0:
            /*
             * Long word copy selected by the dispatch in memcpy when
             * (src - dst) & 3 == 0 and r6 (bytes left) is below 64.
             *
             *   r5 = src - dst - 4 (source for r0-4), r3 = r5 - 4
             *   r7 = r4 + 11: the pair loop repeats while r0 > r7, tested
             *        before that iteration's two stores
             *   bit 2 of r6 set   -> store one long word, then fall into
             *                        the pair loop at 3
             *   bit 2 of r6 clear -> enter the pair loop at 4 with r1
             *                        already loaded
             *
             * The tail then copies single bytes until r0 == r4, using
             * r5 = src - dst - 1 and the bound r7 = r4 + 1.
             */
351         add     #-4, r5         !  50 EX
352         mov     r4, r7          !   5 MT (latency=0)
353 
354         mov.l   @(r0, r5), r1   !  21 LS (latency=2)
355         mov     #4, r2          !   6 EX
356 
357         add     #11, r7         !  50 EX
358         tst     r2, r6          !  86 MT
359 
360         mov     r5, r3          !   5 MT (latency=0)
361         bt/s    4f              ! 111 BR
362 
363          add    #-4, r3         !  50 EX
364         mov.l   r1,@-r0         !  30 LS
365 
366         ! 4 cycles, 2 long words per iteration
367 3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
368 
369 4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
370         cmp/hi  r7, r0
371 
372         mov.l   r1, @-r0        !  30 LS
373         bt/s    3b              ! 109 BR
374 
375          mov.l  r2, @-r0        !  30 LS
376 
377         ! Copy the final 0-3 bytes
378 
379         add     #3,r5           !  50 EX
380 
381         cmp/eq  r0, r4          !  54 MT
382         add     #-10, r7        !  50 EX
383 
384         bt      9f              ! 110 BR
385 
386         ! 3 cycles, 1 byte per iteration
387 1:      mov.b   @(r0,r5),r1     !  19 LS
388         cmp/hi  r7,r0           !  57 MT
389 
390         bt/s    1b              ! 111 BR
391          mov.b  r1,@-r0         !  28 LS
392 
            /* r0 == r4 (dst) on both ways of reaching this return. */
393 9:      rts
394          nop
395 
396         ! Size is at least 64 bytes, so will be going round the big loop at least once.
397         !
398         !   r2 = rounded up r4
399         !   r3 = rounded down r0
400 
401         .balign 32
402 .Lcase0b:
            /*
             * Long word aligned copy of 64 bytes or more.
             *   .Lcase0b  : entered from the dispatch with r5 = src - dst
             *   .Lcase00b : entered from .Lcase00 with r5 = src - dst - 4
             *
             * Phases:
             *   1. copy long words (two per iteration) until the destination
             *      cursor r0 is 32-byte aligned
             *   2. copy 32-byte blocks, using movca.l for the first long word
             *      of each block, until r1 reaches r2 (r4 rounded up to a
             *      32-byte boundary)
             *   3. copy the remaining long words, then the final 0-3 bytes
             *
             * As everywhere in memcpy, data is copied from the end of the
             * buffers down towards r4 (dst), and r0 == r4 on return.
             */
403         add     #-4, r5         !  50 EX
404 
405 .Lcase00b:
406         mov     r0, r3          !   5 MT (latency=0)
407         mov     #(~0x1f), r1    !   6 EX
408 
409         and     r1, r3          !  78 EX
410         mov     r4, r2          !   5 MT (latency=0)
411 
412         cmp/eq  r3, r0          !  54 MT
413         add     #0x1f, r2       !  50 EX
414 
415         bt/s    1f              ! 110 BR
416          and    r1, r2          !  78 EX
417 
418         ! copy initial words until cache line aligned
            /*
             * r6 = r5 - 4 addresses the second long word of each pair.
             * r3 becomes (r0 rounded down to 32) + 8: the pair loop tests
             * "r0 == r3" before its two stores, so it ends with r0 on the
             * 32-byte boundary.  If bit 2 of r0 is set a single long word is
             * stored first; "tst #0x18, r0" then detects the case where that
             * one store was all that was needed.
             */
419 
420         mov.l   @(r0, r5), r1   !  21 LS (latency=2)
421         tst     #4, r0          !  87 MT
422 
423         mov     r5, r6          !   5 MT (latency=0)
424         add     #-4, r6         !  50 EX
425 
426         bt/s    4f              ! 111 BR
427          add    #8, r3          !  50 EX
428 
429         tst     #0x18, r0       !  87 MT
430 
431         bt/s    1f              ! 109 BR
432          mov.l  r1,@-r0         !  30 LS
433 
434         ! 4 cycles, 2 long words per iteration
435 3:      mov.l   @(r0, r5), r1   !  21 LS (latency=2)
436 
437 4:      mov.l   @(r0, r6), r7   !  21 LS (latency=2)
438         cmp/eq  r3, r0          !  54 MT
439 
440         mov.l   r1, @-r0        !  30 LS
441         bf/s    3b              ! 109 BR
442 
443          mov.l  r7, @-r0        !  30 LS
444 
445         ! Copy the cache line aligned blocks
446         !
447         ! In use: r0, r2, r4, r5
448         ! Scratch: r1, r3, r6, r7
449         !
450         ! We could do this with the four scratch registers, but if src
451         ! and dest hit the same cache line, this will thrash, so make
452         ! use of additional registers.
453         !
454         ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
455         !   r5:  src (was r0+r5)
456         !   r1:  dest (was r0)
457         ! this can be reversed at the end, so we don't need to save any extra
458         ! state.
459         !
            /*
             * r8-r11 are pushed here and popped after the loop.  After
             * "add r0, r5" and "add #-0x1c, r5", r5 is the absolute address
             * of the 32 source bytes that belong at r1 - 0x20.
             */
460 1:      mov.l   r8, @-r15       !  30 LS
461         add     r0, r5          !  49 EX
462 
463         mov.l   r9, @-r15       !  30 LS
464         mov     r0, r1          !   5 MT (latency=0)
465 
466         mov.l   r10, @-r15      !  30 LS
467         add     #-0x1c, r5      !  50 EX
468 
469         mov.l   r11, @-r15      !  30 LS
470 
471         ! 16 cycles, 32 bytes per iteration
472 2:      mov.l   @(0x00,r5),r0   ! 18 LS (latency=2)
473         add     #-0x20, r1      ! 50 EX
474         mov.l   @(0x04,r5),r3   ! 18 LS (latency=2)
475         mov.l   @(0x08,r5),r6   ! 18 LS (latency=2)
476         mov.l   @(0x0c,r5),r7   ! 18 LS (latency=2)
477         mov.l   @(0x10,r5),r8   ! 18 LS (latency=2)
478         mov.l   @(0x14,r5),r9   ! 18 LS (latency=2)
479         mov.l   @(0x18,r5),r10  ! 18 LS (latency=2)
480         mov.l   @(0x1c,r5),r11  ! 18 LS (latency=2)
481         movca.l r0,@r1          ! 40 LS (latency=3-7)
482         mov.l   r3,@(0x04,r1)   ! 33 LS
483         mov.l   r6,@(0x08,r1)   ! 33 LS
484         mov.l   r7,@(0x0c,r1)   ! 33 LS
485 
486         mov.l   r8,@(0x10,r1)   ! 33 LS
487         add     #-0x20, r5      ! 50 EX
488 
489         mov.l   r9,@(0x14,r1)   ! 33 LS
490         cmp/eq  r2,r1           ! 54 MT
491 
492         mov.l   r10,@(0x18,r1)  !  33 LS
493         bf/s    2b              ! 109 BR
494 
495          mov.l  r11,@(0x1c,r1)  !  33 LS
496 
            /*
             * Restore the r0 / r5 invariant and pop r11..r8.  The pop of r8
             * at label 1 is shared: it is the delay slot of the rts taken
             * when r0 == r4 (nothing left to copy) and also the target of
             * "bf/s 1f" when bytes remain.
             */
497         mov     r1, r0          !   5 MT (latency=0)
498 
499         mov.l   @r15+, r11      !  15 LS
500         sub     r1, r5          !  75 EX
501 
502         mov.l   @r15+, r10      !  15 LS
503         cmp/eq  r4, r0          !  54 MT
504 
505         bf/s    1f              ! 109 BR
506          mov.l   @r15+, r9      !  15 LS
507 
508         rts
509 1:       mov.l  @r15+, r8       !  15 LS
510         sub     r4, r1          !  75 EX                (len remaining)
511 
512         ! number of trailing bytes is non-zero
513         !
514         ! invariants restored (r5 already decremented by 4)
515         ! also r1=num bytes remaining
            /*
             * r7 = r4 + 11 is the bound for the long word pair loop below
             * (the loop repeats while r0 > r7, tested before its two stores).
             * Fewer than 4 bytes left (cmp/hs leaves T clear): go straight to
             * the byte tail at 5.
             */
516 
517         mov     #4, r2          !   6 EX
518         mov     r4, r7          !   5 MT (latency=0)
519 
520         add     #0x1c, r5       !  50 EX                (back to -4)
521         cmp/hs  r2, r1          !  58 MT
522 
523         bf/s    5f              ! 108 BR
524          add     #11, r7        !  50 EX
525 
526         mov.l   @(r0, r5), r6   !  21 LS (latency=2)
527         tst     r2, r1          !  86 MT
528 
529         mov     r5, r3          !   5 MT (latency=0)
530         bt/s    4f              ! 111 BR
531 
532          add    #-4, r3         !  50 EX
            /*
             * Bit 2 of the remaining length is set: store one long word
             * (delay slot below) and carry on with the pair loop only if at
             * least two more long words follow it.  r0 has not been
             * decremented yet, so "r0 > r7" means 12 or more bytes were left
             * before this store; otherwise only 0-3 bytes remain after it
             * and the byte tail at 5 finishes the copy.
             *
             * This used to repeat "cmp/hs r2, r1" from above with r1 and r2
             * unchanged, which made the branch to 5 unconditional and left
             * up to 27 bytes to be copied one at a time.
             */
533         cmp/hi  r7, r0          !  57 MT
534 
535         bf/s    5f              ! 109 BR
536          mov.l  r6,@-r0         !  30 LS
537 
538         ! 4 cycles, 2 long words per iteration
539 3:      mov.l   @(r0, r5), r6   !  21 LS (latency=2)
540 
541 4:      mov.l   @(r0, r3), r2   !  21 LS (latency=2)
542         cmp/hi  r7, r0
543 
544         mov.l   r6, @-r0        !  30 LS
545         bt/s    3b              ! 109 BR
546 
547          mov.l  r2, @-r0        !  30 LS
548 
549         ! Copy the final 0-3 bytes
            /*
             * r7 becomes r4 + 1 and r5 becomes src - dst - 1, so @(r0,r5) is
             * the source byte for r0-1 and the loop ("r0 > r7" tested before
             * each store) ends with r0 == r4.
             */
550 
551 5:      cmp/eq  r0, r4          !  54 MT
552         add     #-10, r7        !  50 EX
553 
554         bt      9f              ! 110 BR
555         add     #3,r5           !  50 EX
556 
557         ! 3 cycles, 1 byte per iteration
558 1:      mov.b   @(r0,r5),r1     !  19 LS
559         cmp/hi  r7,r0           !  57 MT
560 
561         bt/s    1b              ! 111 BR
562          mov.b  r1,@-r0         !  28 LS
563 
            /* Also the target of "bt/s 9b" in the tail that follows .Lcase2b. */
564 9:      rts
565          nop
566 
567         !
568         !       GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
569         !
570 
571         .balign 32
572 .Lcase2:
573         ! Size is 16 or greater and less than 64, but may have trailing bytes
574 
575 2:      mov     r5, r6          !   5 MT (latency=0)
            /*
             * Selected by the dispatch in memcpy when (src - dst) & 3 == 2 and
             * fewer than 64 bytes are left.  Copies 16 bits at a time, two
             * words per iteration, from the end of the buffers downwards:
             *   r5 = src - dst - 2 (source for r0-2)
             *   r6 = src - dst - 4 (source for r0-4)
             *   r2 = r4 + 7: the loop repeats while r0 > r2, tested before
             *        that iteration's two stores
             * It then branches to label 10 (after .Lcase2b) with r5 still at
             * src - dst - 2, where the last words and byte are copied.
             */
576         add     #-2,r5          !  50 EX
577 
578         mov     r4,r2           !   5 MT (latency=0)
579         add     #-4,r6          !  50 EX
580 
581         add     #7,r2           !  50 EX
582 3:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)
583 
584         mov.w   @(r0,r6),r3     !  20 LS (latency=2)
585         cmp/hi  r2,r0           !  57 MT
586 
587         mov.w   r1,@-r0         !  29 LS
588         bt/s    3b              ! 111 BR
589 
590          mov.w  r3,@-r0         !  29 LS
591 
592         bra     10f
593          nop
594 
595 
596         .balign 32
597 .Lcase2b:
            /*
             * Selected by the dispatch in memcpy when (src - dst) & 3 == 2 and
             * at least 64 bytes are left.
             *
             * Phases:
             *   1. copy 16 bits at a time until the destination cursor r0 is
             *      32-byte aligned (r3 = r0 rounded down to 32, plus 2; the
             *      loop tests "r0 == r3" before each store)
             *   2. copy 32-byte blocks until r1 reaches r2 (r4 rounded up to
             *      a 32-byte boundary).  The source is read into registers
             *      and xtrct merges each pair of neighbouring registers into
             *      one of the eight long words that are stored.
             *   3. label 10: copy the remaining 16-bit words, then one last
             *      byte if needed.  .Lcase2 also finishes here via "bra 10f".
             *
             * r8-r12 are pushed on the stack before phase 2 and popped after.
             */
598         ! Size is at least 64 bytes, so will be going round the big loop at least once.
599         !
600         !   r2 = rounded up r4
601         !   r3 = rounded down r0
602 
603         mov     r0, r3          !   5 MT (latency=0)
604         mov     #(~0x1f), r1    !   6 EX
605 
606         and     r1, r3          !  78 EX
607         mov     r4, r2          !   5 MT (latency=0)
608 
609         cmp/eq  r3, r0          !  54 MT
610         add     #0x1f, r2       !  50 EX
611 
612         add     #-2, r5         !  50 EX
613         bt/s    1f              ! 110 BR
614          and    r1, r2          !  78 EX
615 
616         ! Copy a short word one at a time until we are cache line aligned
617         !   Normal values: r0, r2, r3, r4
618         !   Unused: r1, r6, r7
619         !   Mod: r5 (=r5-2)
620         !
621         add     #2, r3          !  50 EX
622 
623 2:      mov.w   @(r0,r5),r1     !  20 LS (latency=2)
624         cmp/eq  r3,r0           !  54 MT
625 
626         bf/s    2b              ! 111 BR
627 
628          mov.w  r1,@-r0         !  29 LS
629 
630         ! Copy the cache line aligned blocks
631         !
632         ! In use: r0, r2, r4, r5 (=r5-2)
633         ! Scratch: r1, r3, r6, r7
634         !
635         ! We could do this with the four scratch registers, but if src
636         ! and dest hit the same cache line, this will thrash, so make
637         ! use of additional registers.
638         !
639         ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
640         !   r5:  src (was r0+r5)
641         !   r1:  dest (was r0)
642         ! this can be reversed at the end, so we don't need to save any extra
643         ! state.
644         !
            /*
             * After "add r0, r5" and "add #-0x1e, r5", r5 is the absolute
             * address of the 32 source bytes that belong at r1 - 0x20.
             */
645 1:      mov.l   r8, @-r15       !  30 LS
646         add     r0, r5          !  49 EX
647 
648         mov.l   r9, @-r15       !  30 LS
649         mov     r0, r1          !   5 MT (latency=0)
650 
651         mov.l   r10, @-r15      !  30 LS
652         add     #-0x1e, r5      !  50 EX
653 
654         mov.l   r11, @-r15      !  30 LS
655 
656         mov.l   r12, @-r15      !  30 LS
657 
658         ! 17 cycles, 32 bytes per iteration
659 #ifdef CONFIG_CPU_LITTLE_ENDIAN
            /*
             * Little-endian: the block is read upwards with post-increment as
             * one 16-bit word, seven long words and one more 16-bit word
             * (32 bytes in total), so "add #-0x40, r5" moves r5 to the start
             * of the next lower block.
             */
660 2:      mov.w   @r5+, r0        !  14 LS (latency=2)            ..JI
661         add     #-0x20, r1      !  50 EX
662 
663         mov.l   @r5+, r3        !  15 LS (latency=2)            NMLK
664 
665         mov.l   @r5+, r6        !  15 LS (latency=2)            RQPO
666         shll16  r0              ! 103 EX                        JI..
667 
668         mov.l   @r5+, r7        !  15 LS (latency=2)
669         xtrct   r3, r0          !  48 EX                        LKJI
670 
671         mov.l   @r5+, r8        !  15 LS (latency=2)
672         xtrct   r6, r3          !  48 EX                        PONM
673 
674         mov.l   @r5+, r9        !  15 LS (latency=2)
675         xtrct   r7, r6          !  48 EX
676 
677         mov.l   @r5+, r10       !  15 LS (latency=2)
678         xtrct   r8, r7          !  48 EX
679 
680         mov.l   @r5+, r11       !  15 LS (latency=2)
681         xtrct   r9, r8          !  48 EX
682 
683         mov.w   @r5+, r12       !  15 LS (latency=2)
684         xtrct   r10, r9         !  48 EX
685 
686         movca.l r0,@r1          !  40 LS (latency=3-7)
687         xtrct   r11, r10        !  48 EX
688 
689         mov.l   r3, @(0x04,r1)  !  33 LS
690         xtrct   r12, r11        !  48 EX
691 
692         mov.l   r6, @(0x08,r1)  !  33 LS
693 
694         mov.l   r7, @(0x0c,r1)  !  33 LS
695 
696         mov.l   r8, @(0x10,r1)  !  33 LS
697         add     #-0x40, r5      !  50 EX
698 
699         mov.l   r9, @(0x14,r1)  !  33 LS
700         cmp/eq  r2,r1           !  54 MT
701 
702         mov.l   r10, @(0x18,r1) !  33 LS
703         bf/s    2b              ! 109 BR
704 
705          mov.l  r11, @(0x1c,r1) !  33 LS
706 #else
            /*
             * Big-endian: the block is read with displacement addressing,
             * highest address first, and stored from the top of the
             * destination block downwards.  r1 is lowered in two steps
             * (-4 before the movca.l, -0x1c after it); r5 is lowered by 2
             * and then by 0x1e, 0x20 in total per iteration.
             */
707 2:      mov.w   @(0x1e,r5), r0  !  17 LS (latency=2)
708         add     #-2, r5         !  50 EX
709 
710         mov.l   @(0x1c,r5), r3  !  18 LS (latency=2)
711         add     #-4, r1         !  50 EX
712 
713         mov.l   @(0x18,r5), r6  !  18 LS (latency=2)
714         shll16  r0              ! 103 EX
715 
716         mov.l   @(0x14,r5), r7  !  18 LS (latency=2)
717         xtrct   r3, r0          !  48 EX
718 
719         mov.l   @(0x10,r5), r8  !  18 LS (latency=2)
720         xtrct   r6, r3          !  48 EX
721 
722         mov.l   @(0x0c,r5), r9  !  18 LS (latency=2)
723         xtrct   r7, r6          !  48 EX
724 
725         mov.l   @(0x08,r5), r10 !  18 LS (latency=2)
726         xtrct   r8, r7          !  48 EX
727 
728         mov.l   @(0x04,r5), r11 !  18 LS (latency=2)
729         xtrct   r9, r8          !  48 EX
730 
731         mov.l   @(0x00,r5), r12 !  18 LS (latency=2)
732         xtrct   r10, r9         !  48 EX
733 
734         movca.l r0,@r1          !  40 LS (latency=3-7)
735         add     #-0x1c, r1      !  50 EX
736 
737         mov.l   r3, @(0x18,r1)  !  33 LS
738         xtrct   r11, r10        !  48 EX
739 
740         mov.l   r6, @(0x14,r1)  !  33 LS
741         xtrct   r12, r11        !  48 EX
742 
743         mov.l   r7, @(0x10,r1)  !  33 LS
744 
745         mov.l   r8, @(0x0c,r1)  !  33 LS
746         add     #-0x1e, r5      !  50 EX
747 
748         mov.l   r9, @(0x08,r1)  !  33 LS
749         cmp/eq  r2,r1           !  54 MT
750 
751         mov.l   r10, @(0x04,r1) !  33 LS
752         bf/s    2b              ! 109 BR
753 
754          mov.l  r11, @(0x00,r1) !  33 LS
755 #endif
756 
            /*
             * Pop r12..r8 and restore the r0 / r5 invariant.  The pop of r8
             * at label 1 is shared: it is the delay slot of the rts taken
             * when r0 == r4 (nothing left) and also the target of
             * "bf/s 1f" when bytes remain.  "add #0x1e, r5" then puts r5
             * back at src - dst - 2 as label 10 requires.
             */
757         mov.l   @r15+, r12
758         mov     r1, r0          !   5 MT (latency=0)
759 
760         mov.l   @r15+, r11      !  15 LS
761         sub     r1, r5          !  75 EX
762 
763         mov.l   @r15+, r10      !  15 LS
764         cmp/eq  r4, r0          !  54 MT
765 
766         bf/s    1f              ! 109 BR
767          mov.l   @r15+, r9      !  15 LS
768 
769         rts
770 1:       mov.l  @r15+, r8       !  15 LS
771 
772         add     #0x1e, r5       !  50 EX
773 
774         ! Finish off a short word at a time
775         ! r5 must be invariant - 2
            /*
             * Skip the word loop unless r0 > r4 + 1 (at least two bytes
             * left).  Inside the loop the bound is r4 + 3, tested before each
             * store, so the loop leaves 0 or 1 byte.
             */
776 10:     mov     r4,r2           !   5 MT (latency=0)
777         add     #1,r2           !  50 EX
778 
779         cmp/hi  r2, r0          !  57 MT
780         bf/s    1f              ! 109 BR
781 
782          add    #2, r2          !  50 EX
783 
784 3:      mov.w   @(r0,r5),r1     !  20 LS
785         cmp/hi  r2,r0           !  57 MT
786 
787         bt/s    3b              ! 109 BR
788 
789          mov.w  r1,@-r0         !  29 LS
790 1:
791 
792         !
793         ! Finally, copy the last byte if necessary
            /*
             * "9b" is the nearest preceding label 9, the "rts; nop" that ends
             * the .Lcase0b code.  The delay slot moves r5 to src - dst - 1 so
             * that @(r0,r5) is the source byte for r0-1; the final store sits
             * in the delay slot of the rts.
             */
794         cmp/eq  r4,r0           !  54 MT
795         bt/s    9b
796          add    #1,r5
797         mov.b   @(r0,r5),r1
798         rts
799          mov.b  r1,@-r0
800 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php