/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* NOTE(review): the copy of this file that was reviewed had every comment
   line truncated on the right and had listing line numbers fused into the
   text.  The comment wording below was completed during clean-up; confirm
   it against the canonical file before relying on the exact phrasing.
   The instruction stream is reproduced exactly as it appeared.  */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow efficiently,
	but for some it doesn't.  Including store queue overflows.  It causes
	a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	This results in many nasty memory traps without care.

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take the
	most advantage of EV6's bandwidth.

	I also prefetched the read stream.  Note that adding the read prefetch
	forced me to add another cycle to the inner-most kernel - up to 11
	from the original 8 cycles per iteration.  We could improve performance
	further by unrolling the loop and doing multiple prefetches per cycle.

   I think that the code below will be very robust and fast code for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */

/*
 * copy_page
 *
 * In:	$16 = destination page (every stq and wh64 goes through it)
 *	$17 = source page (every ldq and prefetch load goes through it)
 *	Presumably called from C as copy_page(to, from) -- confirm against
 *	the C declaration, which is not visible from this file.
 *
 * Work done: 118 main-loop iterations + 10 cleanup iterations, each
 * moving one 64-byte line as eight quadwords, i.e. 128 * 64 = 8192 bytes.
 *
 * Register roles:
 *	$0-$7	the eight quadwords of the line currently being copied
 *	$1, $2	also used as scratch write-hint addresses in the preamble
 *	$18	remaining-iteration counter (118 for loop 1, then 10 for loop 2)
 *	$19	write-hint pointer; starts at dest + 10*64, so it stays ten
 *		lines ahead of $16 throughout the main loop
 *	$31	destination of the "ldl $31,..." loads, which the comments in
 *		this file use as read prefetches (the loaded value is unused)
 *
 * Modified on exit: $0-$7, $16, $17, $18, $19.  The routine uses no stack.
 *
 * Prefetch coverage (line numbers within the page):
 *	write hints:	lines 0-9 in the preamble, lines 10-127 in loop 1
 *	read prefetch:	lines 0-4 in the preamble, lines 5-122 in loop 1
 *			(320 bytes = 5 lines ahead), lines 123-127 before loop 2
 * Neither stream ever touches memory beyond the 128 lines being copied,
 * which is why the last 10 lines are handled by a separate loop with no
 * write hints or prefetches.
 *
 * The code is laid out in groups of four instructions, matching the
 * "at most 4 aligned instructions per cycle" fetch behaviour described
 * above.  The nop/unop padding is part of the schedule: do not remove,
 * reorder or swap nop for unop without re-measuring on hardware.
 */

#include <linux/export.h>
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118			/* main-loop trip count */
	lda	$1,3*64($16)

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)		/* next line to write-hint */
	nop

	/* Main prefetching/write-hinting loop: 11 groups of four
	   instructions per 64-byte line.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)		/* read prefetch, 5 lines ahead */
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)			/* write hint, 10 lines ahead of $16 */
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  */
	lda	$18,10			/* cleanup-loop trip count */
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	ret
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.