~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/alpha/lib/ev6-copy_page.S

Version: ~ [ linux-6.12-rc7 ] ~ [ linux-6.11.7 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.60 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.116 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.171 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.229 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.285 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.323 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.12 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /* SPDX-License-Identifier: GPL-2.0 */
  2 /*
  3  * arch/alpha/lib/ev6-copy_page.S
  4  *
  5  * Copy an entire page.
  6  */
  7 
  8 /* The following comparison of this routine vs the normal copy_page.S
  9    was written by an unnamed ev6 hardware designer and forwarded to me
 10    via Steven Hobbs <hobbs@steven.zko.dec.com>.
 11  
 12    First Problem: STQ overflows.
 13    -----------------------------
 14 
 15         It would be nice if EV6 handled every resource overflow efficiently,
 16         but for some it doesn't.  Including store queue overflows.  It causes
 17         a trap and a restart of the pipe.
 18 
 19         To get around this we sometimes use (to borrow a term from a VSSAD
 20         researcher) "aeration".  The idea is to slow the rate at which the
 21         processor receives valid instructions by inserting nops in the fetch
 22         path.  In doing so, you can prevent the overflow and actually make
 23         the code run faster.  You can, of course, take advantage of the fact
 24         that the processor can fetch at most 4 aligned instructions per cycle.
 25 
 26         I inserted enough nops to force it to take 10 cycles to fetch the
 27         loop code.  In theory, EV6 should be able to execute this loop in
 28         9 cycles but I was not able to get it to run that fast -- the initial
 29         conditions were such that I could not reach this optimum rate on
 30         (chaotic) EV6.  I wrote the code such that everything would issue
 31         in order. 
 32 
 33    Second Problem: Dcache index matches.
 34    -------------------------------------
 35 
 36         If you are going to use this routine on random aligned pages, there
 37         is a 25% chance that the pages will be at the same dcache indices.
  38         Without care, this results in many nasty memory traps.
 39 
 40         The solution is to schedule the prefetches to avoid the memory
 41         conflicts.  I schedule the wh64 prefetches farther ahead of the
 42         read prefetches to avoid this problem.
 43 
 44    Third Problem: Needs more prefetching.
 45    --------------------------------------
 46 
 47         In order to improve the code I added deeper prefetching to take the
 48         most advantage of EV6's bandwidth.
 49 
 50         I also prefetched the read stream. Note that adding the read prefetch
 51         forced me to add another cycle to the inner-most kernel - up to 11
 52         from the original 8 cycles per iteration.  We could improve performance
 53         further by unrolling the loop and doing multiple prefetches per cycle.
 54 
 55    I think that the code below will be very robust and fast code for the
 56    purposes of copying aligned pages.  It is slower when both source and
 57    destination pages are in the dcache, but it is my guess that this is
 58    less important than the dcache miss case.  */
 59 
 60 #include <linux/export.h>
  61         .text
  62         .align 4
  63         .global copy_page
  64         .ent copy_page
  65 copy_page:                      /* stores go through $16, loads come through $17 */
  66         .prologue 0
  67 
  68         /* Prefetch 5 read cache lines (ldl into $31); write-hint 10 cache lines (wh64).  */
  69         wh64    ($16)
  70         ldl     $31,0($17)
  71         ldl     $31,64($17)
  72         lda     $1,1*64($16)            /* $1/$2: scratch addresses for the following wh64s */
  73 
  74         wh64    ($1)
  75         ldl     $31,128($17)
  76         ldl     $31,192($17)
  77         lda     $1,2*64($16)
  78 
  79         wh64    ($1)
  80         ldl     $31,256($17)
  81         lda     $18,118                 /* $18 = main-loop pass count */
  82         lda     $1,3*64($16)
  83 
  84         wh64    ($1)
  85         nop
  86         lda     $1,4*64($16)
  87         lda     $2,5*64($16)
  88 
  89         wh64    ($1)
  90         wh64    ($2)
  91         lda     $1,6*64($16)
  92         lda     $2,7*64($16)
  93 
  94         wh64    ($1)
  95         wh64    ($2)
  96         lda     $1,8*64($16)
  97         lda     $2,9*64($16)
  98 
  99         wh64    ($1)
 100         wh64    ($2)
 101         lda     $19,10*64($16)          /* $19 = next address to write-hint (10 lines past $16) */
 102         nop
 103 
 104         /* Main prefetching/write-hinting loop; each pass copies 64 bytes via $0-$7.  */
 105 1:      ldq     $0,0($17)
 106         ldq     $1,8($17)
 107         unop                            /* unops are the "aeration" padding described in the header comment */
 108         unop
 109 
 110         unop
 111         unop
 112         ldq     $2,16($17)
 113         ldq     $3,24($17)
 114 
 115         ldq     $4,32($17)
 116         ldq     $5,40($17)
 117         unop
 118         unop
 119 
 120         unop
 121         unop
 122         ldq     $6,48($17)
 123         ldq     $7,56($17)
 124 
 125         ldl     $31,320($17)            /* read prefetch, 320 bytes (5 lines) ahead of the current source line */
 126         unop
 127         unop
 128         unop
 129 
 130         /* This gives the extra cycle of aeration above the minimum.  */
 131         unop                    
 132         unop
 133         unop
 134         unop
 135 
 136         wh64    ($19)                   /* write hint, 10 lines ahead of the line being stored */
 137         unop
 138         unop
 139         unop
 140 
 141         stq     $0,0($16)
 142         subq    $18,1,$18               /* one pass fewer remaining */
 143         stq     $1,8($16)
 144         unop
 145 
 146         unop
 147         stq     $2,16($16)
 148         addq    $17,64,$17              /* advance source; all loads for this pass are already issued */
 149         stq     $3,24($16)
 150 
 151         stq     $4,32($16)
 152         stq     $5,40($16)
 153         addq    $19,64,$19              /* advance write-hint address */
 154         unop
 155 
 156         stq     $6,48($16)
 157         stq     $7,56($16)
 158         addq    $16,64,$16              /* advance destination after the last store of the pass */
 159         bne     $18, 1b
 160 
 161         /* Prefetch the final 5 cache lines of the read stream.  */
 162         lda     $18,10                  /* $18 = cleanup-loop pass count (118 + 10 = 128 passes of 64 bytes) */
 163         ldl     $31,320($17)
 164         ldl     $31,384($17)
 165         ldl     $31,448($17)
 166 
 167         ldl     $31,512($17)
 168         ldl     $31,576($17)
 169         nop
 170         nop
 171 
 172         /* Non-prefetching, non-write-hinting cleanup loop for the
 173            final 10 cache lines.  */
 174 2:      ldq     $0,0($17)
 175         ldq     $1,8($17)
 176         ldq     $2,16($17)
 177         ldq     $3,24($17)
 178 
 179         ldq     $4,32($17)
 180         ldq     $5,40($17)
 181         ldq     $6,48($17)
 182         ldq     $7,56($17)
 183 
 184         stq     $0,0($16)
 185         subq    $18,1,$18
 186         stq     $1,8($16)
 187         addq    $17,64,$17
 188 
 189         stq     $2,16($16)
 190         stq     $3,24($16)
 191         stq     $4,32($16)
 192         stq     $5,40($16)
 193 
 194         stq     $6,48($16)
 195         stq     $7,56($16)
 196         addq    $16,64,$16
 197         bne     $18, 2b
 198 
 199         ret
 200         nop                             /* NOTE(review): presumably padding to fill out the fetch group after ret -- confirm */
 201         unop
 202         nop
 203 
 204         .end copy_page
 205         EXPORT_SYMBOL(copy_page)

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php