/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Implementation of POLYVAL using ARMv8 Crypto Extensions.
 *
 * Copyright 2021 Google LLC
 */
/*
 * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions.
 * It works on 8 blocks at a time, by precomputing the first 8 powers of the
 * key h: h^8, ..., h^1 in the POLYVAL finite field. This allows us to split
 * finite field multiplication into two steps.
 *
 * In the first step, we consider h^i, m_i as polynomials of degree less
 * than 128. We then compute p(x) = h^8 m_0 + ... + h^1 m_7, where
 * multiplication is simply polynomial multiplication.
 *
 * In the second step, we compute the reduction of p(x) modulo the finite field
 * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
 *
 * This two-step process is equivalent to computing the sum where
 * multiplication is finite field multiplication. The advantage is that the
 * two-step process only requires 1 finite field reduction for every 8
 * polynomial multiplications. Further parallelism is gained by interleaving the
 * multiplications and polynomial reductions.
 */

#include <linux/linkage.h>
#define STRIDE_BLOCKS 8

/*
 * General-purpose register aliases; x0-x3 are the AAPCS64 argument registers
 * of the two exported functions below.
 */
KEY_POWERS	.req	x0	// pointer to the precomputed key powers h^8..h^1
MSG		.req	x1	// pointer to the message blocks
BLOCKS_LEFT	.req	x2	// number of 16-byte blocks still to process
ACCUMULATOR	.req	x3	// pointer to the 16-byte accumulator
KEY_START	.req	x10	// saved start of the key-powers table
EXTRA_BYTES	.req	x11
TMP		.req	x13

/* SIMD register aliases. */
M0	.req	v0		// message blocks m_0..m_7
M1	.req	v1
M2	.req	v2
M3	.req	v3
M4	.req	v4
M5	.req	v5
M6	.req	v6
M7	.req	v7
KEY8	.req	v8		// key powers h^8..h^1
KEY7	.req	v9
KEY6	.req	v10
KEY5	.req	v11
KEY4	.req	v12
KEY3	.req	v13
KEY2	.req	v14
KEY1	.req	v15
PL	.req	v16		// low half of the 256-bit product
PH	.req	v17		// high half of the 256-bit product
TMP_V	.req	v18
LO	.req	v20		// Karatsuba partial products
MI	.req	v21
HI	.req	v22
SUM	.req	v23		// running POLYVAL accumulator
GSTAR	.req	v24		// g*(x) constant for Montgomery reduction

	.text

	.arch	armv8-a+crypto
	.align	4

	// g*(x) = bits 64-127 of g(x), replicated in both 64-bit lanes.
.Lgstar:
	.quad	0xc200000000000000, 0xc200000000000000

/*
 * The karatsuba1 macro below computes the product of two 128-bit polynomials,
 * accumulating the components of the 256-bit product into LO, MI, HI.
 *
 * Given:
 *   X = [X_1 : X_0]
 *   Y = [Y_1 : Y_0]
 *
 * We compute:
 *   LO += X_0 * Y_0
 *   MI += (X_0 + X_1) * (Y_0 + Y_1)
 *   HI += X_1 * Y_1
 *
 * Later, the 256-bit result can be extracted as
 *   [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
 * This step is done when computing the polynomial reduction for performance
 * reasons.
 */
/*
 * Accumulate the Karatsuba partial products of 128-bit polynomials \X, \Y
 * into LO, MI, HI (see the scheme described above):
 *   LO += X_0 * Y_0
 *   MI += (X_0 + X_1) * (Y_0 + Y_1)
 *   HI += X_1 * Y_1
 *
 * Karatsuba multiplication is used instead of Schoolbook multiplication
 * because it was found to be slightly faster on ARM64 CPUs.
 */
.macro karatsuba1 X Y
	X .req \X
	Y .req \Y
	// v25 = [X_0 : X_1], v26 = [Y_0 : Y_1] (swap halves)
	ext	v25.16b, X.16b, X.16b, #8
	ext	v26.16b, Y.16b, Y.16b, #8
	// v25/v26 now hold X_0 + X_1 / Y_0 + Y_1 in both lanes
	eor	v25.16b, v25.16b, X.16b
	eor	v26.16b, v26.16b, Y.16b
	// Three 64x64 -> 128-bit carryless multiplications
	pmull2	v28.1q, X.2d, Y.2d		// X_1 * Y_1
	pmull	v29.1q, X.1d, Y.1d		// X_0 * Y_0
	pmull	v27.1q, v25.1d, v26.1d		// (X_0 + X_1) * (Y_0 + Y_1)
	eor	HI.16b, HI.16b, v28.16b
	eor	LO.16b, LO.16b, v29.16b
	eor	MI.16b, MI.16b, v27.16b
	.unreq X
	.unreq Y
.endm

/*
 * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
 * them.
 */
.macro karatsuba1_store X Y
	X .req \X
	Y .req \Y
	ext	v25.16b, X.16b, X.16b, #8
	ext	v26.16b, Y.16b, Y.16b, #8
	eor	v25.16b, v25.16b, X.16b
	eor	v26.16b, v26.16b, Y.16b
	pmull2	HI.1q, X.2d, Y.2d
	pmull	LO.1q, X.1d, Y.1d
	pmull	MI.1q, v25.1d, v26.1d
	.unreq X
	.unreq Y
.endm

/*
 * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
 * the result in PL, PH.
 * [PH : PL] =
 *   [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
 */
.macro karatsuba2
	// v4 = [HI_1 + MI_1 : HI_0 + MI_0]
	eor	v4.16b, HI.16b, MI.16b
	// v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
	eor	v4.16b, v4.16b, LO.16b
	// v5 = [HI_0 : LO_1]
	ext	v5.16b, LO.16b, HI.16b, #8
	// v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
	eor	v4.16b, v4.16b, v5.16b
	// HI = [HI_0 : HI_1]
	ext	HI.16b, HI.16b, HI.16b, #8
	// LO = [LO_0 : LO_1]
	ext	LO.16b, LO.16b, LO.16b, #8
	// PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
	ext	PH.16b, v4.16b, HI.16b, #8
	// PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
	ext	PL.16b, LO.16b, v4.16b, #8
.endm
/*
 * Computes the 128-bit reduction of PH : PL into \dest.
 *
 * This macro computes p(x) mod g(x) where p(x) = [PH : PL] and
 * g(x) = x^128 + x^127 + x^126 + x^121 + 1.
 *
 * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
 * product of two 128-bit polynomials in Montgomery form. We need to reduce it
 * mod g(x). Also, since polynomials in Montgomery form carry an extra factor
 * of x^128, this product has two extra factors of x^128. To get it back into
 * Montgomery form, we need to remove one of them by dividing by x^128.
 *
 * To accomplish both of these goals, we add multiples of g(x) that cancel out
 * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
 * bits are zero, the polynomial division by x^128 can be done by right
 * shifting.
 *
 * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
 * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). We can
 * only do 64x64 bit multiplications, so we split the product into
 * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
 * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
 * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
 *
 * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
 * 128-255, giving the answer in bits 128-255. This time we must cancel P_1
 * + T_0 in bits 64-127. The multiple of g(x) needed is (P_1 + T_0) * g(x) *
 * x^64. Adding this to our previous computation gives
 * P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0
 * = g*(x) * (P_1 + T_0).
 *
 * So our final computation is:
 *   T = T_1 : T_0 = g*(x) * P_0
 *   V = V_1 : V_0 = g*(x) * (P_1 + T_0)
 *   p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
 *
 * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
 * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
 * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
 */
.macro montgomery_reduction dest
	DEST .req \dest
	// TMP_V = T_1 : T_0 = P_0 * g*(x)
	pmull	TMP_V.1q, PL.1d, GSTAR.1d
	// TMP_V = T_0 : T_1 (swap halves)
	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
	// TMP_V = P_1 + T_0 : P_0 + T_1
	eor	TMP_V.16b, PL.16b, TMP_V.16b
	// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
	eor	PH.16b, PH.16b, TMP_V.16b
	// TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
	eor	DEST.16b, PH.16b, TMP_V.16b
	.unreq DEST
.endm

/*
 * The full_stride macro below computes POLYVAL on 8 blocks at once.
 *
 * If reduce is set, it also computes the Montgomery reduction of the
 * previous full_stride call and XORs it with the first message block, i.e.
 * computes (m_0 + REDUCE(PL, PH)) h^8 + ... + m_7 h^1 — the first
 * multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
 *
 * Sets PL, PH.
 */
/*
 * Compute POLYVAL on 8 message blocks with the precomputed key powers
 * KEY8..KEY1, leaving the unreduced 256-bit product in PL, PH.
 *
 * If \reduce is set, the Montgomery reduction of the previous stride's
 * PL : PH is interleaved with the multiplications (to hide latency) and its
 * result is XORed into the first message block m_0 before the final multiply.
 */
.macro full_stride reduce
	// Zero the Karatsuba accumulators for this stride.
	eor	LO.16b, LO.16b, LO.16b
	eor	MI.16b, MI.16b, MI.16b
	eor	HI.16b, HI.16b, HI.16b

	ld1	{M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
	ld1	{M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64

	karatsuba1 M7 KEY1
	.if \reduce
	pmull	TMP_V.1q, PL.1d, GSTAR.1d
	.endif

	karatsuba1 M6 KEY2
	.if \reduce
	ext	TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
	.endif

	karatsuba1 M5 KEY3
	.if \reduce
	eor	TMP_V.16b, PL.16b, TMP_V.16b
	.endif

	karatsuba1 M4 KEY4
	.if \reduce
	eor	PH.16b, PH.16b, TMP_V.16b
	.endif

	karatsuba1 M3 KEY5
	.if \reduce
	pmull2	TMP_V.1q, TMP_V.2d, GSTAR.2d
	.endif

	karatsuba1 M2 KEY6
	.if \reduce
	eor	SUM.16b, PH.16b, TMP_V.16b
	.endif

	karatsuba1 M1 KEY7
	// Fold the running accumulator into the first message block.
	eor	M0.16b, M0.16b, SUM.16b

	karatsuba1 M0 KEY8
	karatsuba2
.endm

/*
 * Handle any extra blocks after the full_stride loop (1 to 7 blocks).
 */
.macro partial_stride
	// Point KEY_POWERS at h^{BLOCKS_LEFT} so the remaining blocks pair up
	// with the correct key powers (each power is 16 bytes).
	add	KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
	sub	KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
	ld1	{KEY1.16b}, [KEY_POWERS], #16

	// The first remaining block absorbs the accumulator.
	ld1	{TMP_V.16b}, [MSG], #16
	eor	SUM.16b, SUM.16b, TMP_V.16b
	karatsuba1_store KEY1 SUM
	sub	BLOCKS_LEFT, BLOCKS_LEFT, #1

	// Process the rest in power-of-two batches: 4, then 2, then 1.
	tst	BLOCKS_LEFT, #4
	beq	.Lpartial4BlocksDone
	ld1	{M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
	ld1	{KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
	karatsuba1 M0 KEY8
	karatsuba1 M1 KEY7
	karatsuba1 M2 KEY6
	karatsuba1 M3 KEY5
.Lpartial4BlocksDone:
	tst	BLOCKS_LEFT, #2
	beq	.Lpartial2BlocksDone
	ld1	{M0.16b, M1.16b}, [MSG], #32
	ld1	{KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
	karatsuba1 M0 KEY8
	karatsuba1 M1 KEY7
.Lpartial2BlocksDone:
	tst	BLOCKS_LEFT, #1
	beq	.LpartialDone
	ld1	{M0.16b}, [MSG], #16
	ld1	{KEY8.16b}, [KEY_POWERS], #16
	karatsuba1 M0 KEY8
.LpartialDone:
	karatsuba2
	montgomery_reduction SUM
.endm

/*
 * The pmull_polyval_mul function below performs Montgomery multiplication
 * in GF(2^128): it computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 +
 * x^121 + 1.
 */
/*
 * Perform Montgomery multiplication in GF(2^128).
 *
 * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
 * If op1 and op2 are in Montgomery form, this computes the Montgomery
 * form of op1*op2.
 *
 * void pmull_polyval_mul(u8 *op1, const u8 *op2);
 */
SYM_FUNC_START(pmull_polyval_mul)
	adr	TMP, .Lgstar
	ld1	{GSTAR.2d}, [TMP]
	ld1	{v0.16b}, [x0]
	ld1	{v1.16b}, [x1]
	karatsuba1_store v0 v1
	karatsuba2
	montgomery_reduction SUM
	st1	{SUM.16b}, [x0]
	ret
SYM_FUNC_END(pmull_polyval_mul)

/*
 * Perform polynomial evaluation as specified by POLYVAL:
 *   h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
 * where n=nblocks, h is the hash key, and m_i are the message blocks.
 *
 * x0 - pointer to precomputed key powers h^8 ... h^1
 * x1 - pointer to message blocks
 * x2 - number of blocks to hash
 * x3 - pointer to accumulator
 *
 * void pmull_polyval_update(const struct polyval_key *keys, const u8 *in,
 *			     size_t nblocks, u8 *accumulator);
 */
SYM_FUNC_START(pmull_polyval_update)
	adr	TMP, .Lgstar
	mov	KEY_START, KEY_POWERS
	ld1	{GSTAR.2d}, [TMP]
	ld1	{SUM.16b}, [ACCUMULATOR]
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	blt	.LstrideLoopExit
	// Load all 8 key powers h^8..h^1 once, up front.
	ld1	{KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
	ld1	{KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
	// First stride has no prior PL : PH to reduce.
	full_stride 0
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	blt	.LstrideLoopExitReduce
.LstrideLoop:
	// Subsequent strides interleave the previous stride's reduction.
	full_stride 1
	subs	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	bge	.LstrideLoop
.LstrideLoopExitReduce:
	montgomery_reduction SUM
.LstrideLoopExit:
	// Undo the last subtraction to recover the true remaining count.
	adds	BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
	beq	.LskipPartial
	partial_stride
.LskipPartial:
	st1	{SUM.16b}, [ACCUMULATOR]
	ret
SYM_FUNC_END(pmull_polyval_update)
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.