/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some it doesn't.  Including store queue overflows.  It causes
	a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	This results in many nasty memory traps without care.

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream.  Note that adding the read prefetch
	forced me to add another cycle to the inner-most kernel - up to 11
	from the original 8 cycles per iteration.  We could improve performance
	further by unrolling the loop and doing multiple prefetches per cycle.

   I think that the code below will be very robust and fast code for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */

#include <linux/export.h>

/*
 * copy_page: copy 128 cache lines of 64 bytes each (8192 bytes in total)
 * from the address held in $17 to the address held in $16.
 *
 * Register usage, as visible in the code below:
 *   $16     - store (destination) pointer, advanced by 64 per iteration
 *   $17     - load (source) pointer, advanced by 64 per iteration
 *   $18     - loop counter: 118 for the main loop, then 10 for the
 *             cleanup loop (118 + 10 = 128 cache lines)
 *   $19     - write-hint pointer, started 10 cache lines ahead of $16
 *             and advanced by 64 per main-loop iteration
 *   $0-$7   - the eight quadwords of the cache line being copied
 *   $1, $2  - additionally used as scratch addresses for the start-up
 *             wh64 write hints, before the copy loops begin
 *
 * Loads whose destination is $31 are issued only as read prefetches.
 *
 * The instructions are laid out in groups of four separated by blank
 * lines, matching the "at most 4 aligned instructions per cycle" fetch
 * behaviour described above; the nop/unop padding is the deliberate
 * "aeration" and must not be removed or reordered casually.
 */
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	/* $18 = 118: iteration count for the main loop.  */
	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	/* $19 = destination + 10 cache lines: first line not yet
	   write-hinted, used by the wh64 inside the main loop.  */
	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)
	nop

	/* Main prefetching/write-hinting loop.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	/* Read prefetch 5 cache lines (320 bytes) ahead of the loads.  */
	ldl	$31,320($17)
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	/* Write hint for the line 10 cache lines ahead of the stores.  */
	wh64	($19)
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  */
	/* $18 = 10: iteration count for the cleanup loop.  */
	lda	$18,10
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	ret
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.