/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divqs/__remqs: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divls/__remls: signed 32-bit
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <linux/export.h>
/* NOTE(review): "halt" is not referenced by any code visible in this file. */
#define halt .long 0

/*
 * Select function type and registers
 *
 * mask, divisor, tmp1 and tmp2 live in $0, $1, $3 and $4, which the
 * routines below save to and restore from their stack frame; compare
 * lives in $28, which the calling convention allows us to clobber.
 */
#define mask	$0
#define divisor	$1
#define compare $28
#define tmp1	$3
#define tmp2	$4

/*
 * This one source produces either the divide or the remainder routines,
 * depending on whether DIV is defined:
 *
 *   - DIV_ONLY()/MOD_ONLY() expand to their argument in the matching
 *     build and to nothing in the other one.
 *   - func(x) builds the exported name (__div<x> or __rem<x>).
 *   - The value that must be returned is placed in $27: "quotient" is
 *     $27 in the DIV build, "modulus" is $27 otherwise; the other of
 *     the two lives in the scratch register $2.
 *   - GETSIGN(x) leaves in x a value whose sign bit is the sign the
 *     signed result must have: the xor of dividend and divisor for a
 *     divide, a copy of the dividend for a remainder.
 *   - STACK is the frame size; the DIV build is larger because it also
 *     saves tmp2 at 32($30).
 */
#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 * LONGIFY(x)  - zapnot with byte mask 15 keeps the low four bytes of x
 *               and clears the rest (zero-extension of a 32-bit value).
 * SLONGIFY(x) - addl x,0,x yields the sign-extended low 32 bits of x.
 * Both expand to nothing in the 64-bit build.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

.set noat
.align	4
.globl	ufunction
.ent	ufunction
/*
 * ufunction: unsigned divide (DIV build) or remainder (non-DIV build).
 *
 * In:   $24 = dividend, $25 = divisor, $23 = return address.
 * Out:  $27 = result.
 * $0, $1, $2 and tmp1 (plus tmp2 in the DIV build) are saved in a
 * STACK-byte frame and restored before returning; $28 is clobbered.
 *
 * A zero divisor skips both loops (beq divisor, 9f).  At that point
 * quotient has been cleared and modulus holds the (LONGIFY'd) dividend,
 * so the DIV build returns 0 and the remainder build returns the
 * dividend.
 */
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

	/*
	 * Label 7 is a second entry point: sfunction branches here
	 * (bge $28,7b) after allocating its own STACK-byte frame when
	 * neither operand has its sign bit set.
	 */
7:	stq	$1, 0($30)		# L :
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E :
	LONGIFY(divisor)		# E : U L L U

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E :
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */
	/*
	 * In spite of the DIV_ONLY being either a non-instruction
	 * or an actual stq, the addition of the .align directive
	 * below ensures that label 1 is going to be nicely aligned
	 */

	.align	4
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare	# E :
	s8addq	divisor,$31,divisor	# E :
	s8addq	mask,$31,mask		# E :
	bne	compare,1b		# U : U L U L
#else
	/*
	 * 64-bit case: double divisor and mask while divisor < modulus,
	 * but leave the loop first if the divisor's top bit is already
	 * set (blt), before the addq would shift it out.
	 */
1:	cmpult	divisor,modulus,compare	# E :
	nop				# E :
	nop				# E :
	blt     divisor, 2f		# U : U L U L

	addq	divisor,divisor,divisor	# E :
	addq	mask,mask,mask		# E :
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/* ok, start to go right again.. */
	/*
	 * One iteration per mask bit: when divisor <= modulus, replace
	 * modulus with modulus - divisor and (DIV build) quotient with
	 * quotient + mask; then shift divisor and mask right by one.
	 * tmp2 is computed from mask before mask is shifted.  The loop
	 * ends when mask has been shifted down to zero.
	 */
2:
	/*
	 * Keep things nicely bundled... use a nop instead of not
	 * having an instruction for DIV_ONLY
	 */
#ifdef DIV
	DIV_ONLY(addq quotient,mask,tmp2) # E :
#else
	nop				# E :
#endif
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

#ifdef DIV
	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

	/* Common exit: restore the saved registers and pop the frame. */
9:	ldq	$1, 0($30)		# L :
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	DIV_ONLY(ldq tmp2,32($30))	# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction
	EXPORT_SYMBOL(ufunction)

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 * 	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * In:   $24 = dividend, $25 = divisor, $23 = return address.
 * Out:  $27 = result; $28 is clobbered.
 *
 * If neither operand is negative, control transfers straight into
 * ufunction at label 7, reusing the frame allocated here.  Otherwise
 * the original $24, $25, $23 and tmp1 are saved at 0/8/16/24($30),
 * ufunction is called on the absolute values, the operands are
 * reloaded, and the result is negated when GETSIGN yields a negative
 * value.
 */
.align 4
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	bis	$24,$25,$28		# E :
	SLONGIFY($28)			# E :
	bge	$28,7b			# U :

	stq	$24,0($30)		# L :
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24	/* abs($24) */	# E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L :
	subq	$31,$25,$28		# E : U L U L

	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25	/* abs($25) */	# E : Latency 2, extra map slot
	nop				# E :
	bsr	$23,ufunction		# L0: L U L U

	ldq	$24,0($30)		# L :
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
	nop				# E : U L L U : as part of the cmov

	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction
	EXPORT_SYMBOL(sfunction)
/*
 * Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
 * TOMOYO® is a registered trademark of NTT DATA CORPORATION.
 */