/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Fast SHA-256 implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 */

#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

#define rHP	r3	/* pointer to hash value			*/
#define rKP	r24	/* pointer to round constants			*/
#define rWP	r4	/* pointer to input data			*/

#define rH0	r5	/* 8 32 bit hash values in 8 registers		*/
#define rH1	r6
#define rH2	r7
#define rH3	r8
#define rH4	r9
#define rH5	r10
#define rH6	r11
#define rH7	r12

#define rW0	r14	/* 64 bit registers. 16 words in 8 registers	*/
#define rW1	r15
#define rW2	r16
#define rW3	r17
#define rW4	r18
#define rW5	r19
#define rW6	r20
#define rW7	r21

#define rT0	r22	/* 64 bit temporaries				*/
#define rT1	r23
#define rT2	r0	/* 32 bit temporaries				*/
#define rT3	r25

#define CMP_KN_LOOP
#define CMP_KC_LOOP \
	cmpwi		rT1,0;

#define INITIALIZE \
	stwu		r1,-128(r1);	/* create stack frame		*/ \
	evstdw		r14,8(r1);	/* We must save non volatile	*/ \
	evstdw		r15,16(r1);	/* registers. Take the chance	*/ \
	evstdw		r16,24(r1);	/* and save the SPE part too	*/ \
	evstdw		r17,32(r1); \
	evstdw		r18,40(r1); \
	evstdw		r19,48(r1); \
	evstdw		r20,56(r1); \
	evstdw		r21,64(r1); \
	evstdw		r22,72(r1); \
	evstdw		r23,80(r1); \
	stw		r24,88(r1);	/* save normal registers	*/ \
	stw		r25,92(r1);


#define FINALIZE \
	evldw		r14,8(r1);	/* restore SPE registers	*/ \
	evldw		r15,16(r1); \
	evldw		r16,24(r1); \
	evldw		r17,32(r1); \
	evldw		r18,40(r1); \
	evldw		r19,48(r1); \
	evldw		r20,56(r1); \
	evldw		r21,64(r1); \
	evldw		r22,72(r1); \
	evldw		r23,80(r1); \
	lwz		r24,88(r1);	/* restore normal registers	*/ \
	lwz		r25,92(r1); \
	xor		r0,r0,r0; \
	stw		r0,8(r1);	/* Delete sensitive data	*/ \
	stw		r0,16(r1);	/* that we might have pushed	*/ \
	stw		r0,24(r1);	/* from other context that runs	*/ \
	stw		r0,32(r1);	/* the same code. Assume that	*/ \
	stw		r0,40(r1);	/* the lower part of the GPRs	*/ \
	stw		r0,48(r1);	/* was already overwritten on	*/ \
	stw		r0,56(r1);	/* the way down to here		*/ \
	stw		r0,64(r1); \
	stw		r0,72(r1); \
	stw		r0,80(r1); \
	addi		r1,r1,128;	/* cleanup stack frame		*/

#ifdef __BIG_ENDIAN__
#define LOAD_DATA(reg, off) \
	lwz		reg,off(rWP);	/* load data			*/
#define NEXT_BLOCK \
	addi		rWP,rWP,64;	/* increment per block		*/
#else
#define LOAD_DATA(reg, off) \
	lwbrx		reg,0,rWP;	/* load data			*/ \
	addi		rWP,rWP,4;	/* increment per word		*/
#define NEXT_BLOCK			/* nothing to do		*/
#endif

#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \
	LOAD_DATA(w, off)		/* 1: W				*/ \
	rotrwi		rT0,e,6;	/* 1: S1 = e rotr 6		*/ \
	rotrwi		rT1,e,11;	/* 1: S1' = e rotr 11		*/ \
	rotrwi		rT2,e,25;	/* 1: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 1: S1 = S1 xor S1'		*/ \
	and		rT3,e,f;	/* 1: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 1: S1 = S1 xor S1"		*/ \
	andc		rT1,g,e;	/* 1: ch' = ~e and g		*/ \
	lwz		rT2,off(rKP);	/* 1: K				*/ \
	xor		rT3,rT3,rT1;	/* 1: ch = ch xor ch'		*/ \
	add		h,h,rT0;	/* 1: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 1: temp1' = ch + w		*/ \
	rotrwi		rT0,a,2;	/* 1: S0 = a rotr 2		*/ \
	add		h,h,rT3;	/* 1: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,a,13;	/* 1: S0' = a rotr 13		*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + K		*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 1: S0 = S0 xor S0'		*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	xor		rT3,rT0,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelo	w,w,w;		/*    shift W			*/ \
	or		rT2,a,b;	/* 1: maj = a or b		*/ \
	and		rT1,a,b;	/* 1: maj' = a and b		*/ \
	and		rT2,rT2,c;	/* 1: maj = maj and c		*/ \
	LOAD_DATA(w, off+4)		/* 2: W				*/ \
	or		rT2,rT1,rT2;	/* 1: maj = maj or maj'		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		rT3,rT3,rT2;	/* 1: temp2 = S0 + maj		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	add		h,h,rT3;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	lwz		rT2,off+4(rKP);	/* 2: K				*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	add		g,g,rT0;	/* 2: temp1 = h + S1		*/ \
	add		rT3,rT3,w;	/* 2: temp1' = ch + w		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + temp1'	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	add		g,g,rT2;	/* 2: temp1 = temp1 + K		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/
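/*
 * For reference, the FIPS 180-4 functions computed by the two round
 * macros (R_LOAD_W above, R_CALC_W below); all operations are on
 * 32 bit words:
 *
 *	S1(e) = (e rotr 6) xor (e rotr 11) xor (e rotr 25)
 *	S0(a) = (a rotr 2) xor (a rotr 13) xor (a rotr 22)
 *	ch(e,f,g)  = (e and f) xor (~e and g)
 *	maj(a,b,c) = (a and b) xor (a and c) xor (b and c)
 *		     (computed as ((a or b) and c) or (a and b) in
 *		      R_LOAD_W and as (a and b) + ((a xor b) and c)
 *		      in R_CALC_W; all three forms are equivalent)
 *	s0(x) = (x rotr 7) xor (x rotr 18) xor (x >> 3)
 *	s1(x) = (x rotr 17) xor (x rotr 19) xor (x >> 10)
 *
 *	temp1 = h + S1(e) + ch(e,f,g) + K[t] + W[t]
 *	temp2 = S0(a) + maj(a,b,c)
 *	d = d + temp1; h = temp1 + temp2
 *
 * Each macro processes two rounds; instead of shifting the working
 * variables a..h, the callers rotate the register assignment by two.
 * SPE has no vector rotate right, so the schedule calculation uses
 * evrlwi (rotate left): a left rotate by 32-n equals a right rotate
 * by n, e.g. evrlwi by 25 yields rotr 7, and a further evrlwi by 21
 * on that result yields rotr 18.
 */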
#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \
	rotrwi		rT2,e,6;	/* 1: S1 = e rotr 6		*/ \
	evmergelohi	rT0,w0,w1;	/*    w[-15]			*/ \
	rotrwi		rT3,e,11;	/* 1: S1' = e rotr 11		*/ \
	evsrwiu		rT1,rT0,3;	/*    s0 = w[-15] >> 3		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1'		*/ \
	evrlwi		rT0,rT0,25;	/*    s0' = w[-15] rotr 7	*/ \
	rotrwi		rT3,e,25;	/* 1: S1" = e rotr 25		*/ \
	evxor		rT1,rT1,rT0;	/*    s0 = s0 xor s0'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S1 = S1 xor S1"		*/ \
	evrlwi		rT0,rT0,21;	/*    s0' = w[-15] rotr 18	*/ \
	add		h,h,rT2;	/* 1: temp1 = h + S1		*/ \
	evxor		rT0,rT0,rT1;	/*    s0 = s0 xor s0'		*/ \
	and		rT2,e,f;	/* 1: ch = e and f		*/ \
	evaddw		w0,w0,rT0;	/*    w = w[-16] + s0		*/ \
	andc		rT3,g,e;	/* 1: ch' = ~e and g		*/ \
	evsrwiu		rT0,w7,10;	/*    s1 = w[-2] >> 10		*/ \
	xor		rT2,rT2,rT3;	/* 1: ch = ch xor ch'		*/ \
	evrlwi		rT1,w7,15;	/*    s1' = w[-2] rotr 17	*/ \
	add		h,h,rT2;	/* 1: temp1 = temp1 + ch	*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	rotrwi		rT2,a,2;	/* 1: S0 = a rotr 2		*/ \
	evrlwi		rT1,w7,13;	/*    s1' = w[-2] rotr 19	*/ \
	rotrwi		rT3,a,13;	/* 1: S0' = a rotr 13		*/ \
	evxor		rT0,rT0,rT1;	/*    s1 = s1 xor s1'		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0'		*/ \
	evldw		rT1,off(rKP);	/*    k				*/ \
	rotrwi		rT3,a,22;	/* 1: S0" = a rotr 22		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + s1		*/ \
	xor		rT2,rT2,rT3;	/* 1: S0 = S0 xor S0"		*/ \
	evmergelohi	rT0,w4,w5;	/*    w[-7]			*/ \
	and		rT3,a,b;	/* 1: maj = a and b		*/ \
	evaddw		w0,w0,rT0;	/*    w = w + w[-7]		*/ \
	CMP_K##k##_LOOP \
	add		rT2,rT2,rT3;	/* 1: temp2 = S0 + maj		*/ \
	evaddw		rT1,rT1,w0;	/*    wk = w + k		*/ \
	xor		rT3,a,b;	/* 1: maj = a xor b		*/ \
	evmergehi	rT0,rT1,rT1;	/*    wk1/wk2			*/ \
	and		rT3,rT3,c;	/* 1: maj = maj and c		*/ \
	add		h,h,rT0;	/* 1: temp1 = temp1 + wk	*/ \
	add		rT2,rT2,rT3;	/* 1: temp2 = temp2 + maj	*/ \
	add		g,g,rT1;	/* 2: temp1 = h + wk		*/ \
	add		d,d,h;		/* 1: d = d + temp1		*/ \
	rotrwi		rT0,d,6;	/* 2: S1 = e rotr 6		*/ \
	add		h,h,rT2;	/* 1: h = temp1 + temp2		*/ \
	rotrwi		rT1,d,11;	/* 2: S1' = e rotr 11		*/ \
	rotrwi		rT2,d,25;	/* 2: S1" = e rotr 25		*/ \
	xor		rT0,rT0,rT1;	/* 2: S1 = S1 xor S1'		*/ \
	and		rT3,d,e;	/* 2: ch = e and f		*/ \
	xor		rT0,rT0,rT2;	/* 2: S1 = S1 xor S1"		*/ \
	andc		rT1,f,d;	/* 2: ch' = ~e and g		*/ \
	add		g,g,rT0;	/* 2: temp1 = temp1 + S1	*/ \
	xor		rT3,rT3,rT1;	/* 2: ch = ch xor ch'		*/ \
	rotrwi		rT0,h,2;	/* 2: S0 = a rotr 2		*/ \
	add		g,g,rT3;	/* 2: temp1 = temp1 + ch	*/ \
	rotrwi		rT1,h,13;	/* 2: S0' = a rotr 13		*/ \
	rotrwi		rT3,h,22;	/* 2: S0" = a rotr 22		*/ \
	xor		rT0,rT0,rT1;	/* 2: S0 = S0 xor S0'		*/ \
	or		rT2,h,a;	/* 2: maj = a or b		*/ \
	and		rT1,h,a;	/* 2: maj' = a and b		*/ \
	and		rT2,rT2,b;	/* 2: maj = maj and c		*/ \
	xor		rT3,rT0,rT3;	/* 2: S0 = S0 xor S0"		*/ \
	or		rT2,rT1,rT2;	/* 2: maj = maj or maj'		*/ \
	add		c,c,g;		/* 2: d = d + temp1		*/ \
	add		rT3,rT3,rT2;	/* 2: temp2 = S0 + maj		*/ \
	add		g,g,rT3		/* 2: h = temp1 + temp2		*/

/*
 * ppc_spe_sha256_transform: r3 = pointer to the hash state, r4 = pointer
 * to the input data, r5 = number of 64 byte blocks to process.
 */
_GLOBAL(ppc_spe_sha256_transform)
	INITIALIZE

	mtctr		r5
	lwz		rH0,0(rHP)
	lwz		rH1,4(rHP)
	lwz		rH2,8(rHP)
	lwz		rH3,12(rHP)
	lwz		rH4,16(rHP)
	lwz		rH5,20(rHP)
	lwz		rH6,24(rHP)
	lwz		rH7,28(rHP)

ppc_spe_sha256_main:
	lis		rKP,PPC_SPE_SHA256_K@ha
	addi		rKP,rKP,PPC_SPE_SHA256_K@l

	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24)
	R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32)
	R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40)
	R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48)
	R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56)
ppc_spe_sha256_16_rounds:
	addi		rKP,rKP,64
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW0, rW1, rW4, rW5, rW7, N, 0)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW1, rW2, rW5, rW6, rW0, N, 8)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW2, rW3, rW6, rW7, rW1, N, 16)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW3, rW4, rW7, rW0, rW2, N, 24)
	R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7,
		 rW4, rW5, rW0, rW1, rW3, N, 32)
	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
		 rW5, rW6, rW1, rW2, rW4, N, 40)
	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
		 rW6, rW7, rW2, rW3, rW5, N, 48)
	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
		 rW7, rW0, rW3, rW4, rW6, C, 56)
	bt		gt,ppc_spe_sha256_16_rounds

	lwz		rW0,0(rHP)
	NEXT_BLOCK
	lwz		rW1,4(rHP)
	lwz		rW2,8(rHP)
	lwz		rW3,12(rHP)
	lwz		rW4,16(rHP)
	lwz		rW5,20(rHP)
	lwz		rW6,24(rHP)
	lwz		rW7,28(rHP)

	add		rH0,rH0,rW0
	stw		rH0,0(rHP)
	add		rH1,rH1,rW1
	stw		rH1,4(rHP)
	add		rH2,rH2,rW2
	stw		rH2,8(rHP)
	add		rH3,rH3,rW3
	stw		rH3,12(rHP)
	add		rH4,rH4,rW4
	stw		rH4,16(rHP)
	add		rH5,rH5,rW5
	stw		rH5,20(rHP)
	add		rH6,rH6,rW6
	stw		rH6,24(rHP)
	add		rH7,rH7,rW7
	stw		rH7,28(rHP)

	bdnz		ppc_spe_sha256_main

	FINALIZE
	blr
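/*
 * Round constant table: FIPS 180-4 K[0..63], i.e. the first 32 bits of
 * the fractional parts of the cube roots of the first 64 primes. The
 * 32 byte alignment (.align 5) keeps the 64 bit evldw fetches in
 * R_CALC_W naturally aligned; each fetch pulls the constant pair for
 * two rounds. The loop control above exploits the constants themselves:
 * K[31] = 0x14292967 and K[47] = 0x106aa070 are positive as signed
 * words while K[63] = 0xc67178f2 is negative, so the cmpwi in
 * CMP_KC_LOOP together with "bt gt" leaves the 16 round loop after the
 * third pass without needing a dedicated counter.
 */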
.data
.align 5
PPC_SPE_SHA256_K:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
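/*
 * Plain C reference for the transform above (documentation only;
 * rotr(x,n) denotes a 32 bit rotate right, e.g.
 * ((x >> n) | (x << (32 - n))), "hash" and "in" stand for the r3/r4
 * buffers, and K is the table above). For each 64 byte block:
 *
 *	u32 W[64], a, b, c, d, e, f, g, h, t1, t2;
 *	int i;
 *
 *	for (i = 0; i < 16; i++)	// big endian word loads
 *		W[i] = (u32)in[4*i] << 24 | in[4*i+1] << 16 |
 *		       in[4*i+2] << 8 | in[4*i+3];
 *	for (i = 16; i < 64; i++)	// message schedule
 *		W[i] = (rotr(W[i-15], 7) ^ rotr(W[i-15], 18) ^ (W[i-15] >> 3))
 *		     + W[i-7] + W[i-16]
 *		     + (rotr(W[i-2], 17) ^ rotr(W[i-2], 19) ^ (W[i-2] >> 10));
 *
 *	a = hash[0]; b = hash[1]; c = hash[2]; d = hash[3];
 *	e = hash[4]; f = hash[5]; g = hash[6]; h = hash[7];
 *	for (i = 0; i < 64; i++) {	// 64 rounds, two per macro above
 *		t1 = h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25))
 *		       + ((e & f) ^ (~e & g)) + K[i] + W[i];
 *		t2 = (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22))
 *		   + ((a & b) ^ (a & c) ^ (b & c));
 *		h = g; g = f; f = e; e = d + t1;
 *		d = c; c = b; b = a; a = t1 + t2;
 *	}
 *	hash[0] += a; hash[1] += b; hash[2] += c; hash[3] += d;
 *	hash[4] += e; hash[5] += f; hash[6] += g; hash[7] += h;
 */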