Linux/arch/arm64/crypto/ghash-ce-core.S

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

        SHASH           .req    v0
        SHASH2          .req    v1
        T1              .req    v2
        T2              .req    v3
        MASK            .req    v4
        XM              .req    v5
        XL              .req    v6
        XH              .req    v7
        IN1             .req    v7

        k00_16          .req    v8
        k32_48          .req    v9

        t3              .req    v10
        t4              .req    v11
        t5              .req    v12
        t6              .req    v13
        t7              .req    v14
        t8              .req    v15
        t9              .req    v16

        perm1           .req    v17
        perm2           .req    v18
        perm3           .req    v19

        sh1             .req    v20
        sh2             .req    v21
        sh3             .req    v22
        sh4             .req    v23

        ss1             .req    v24
        ss2             .req    v25
        ss3             .req    v26
        ss4             .req    v27

        XL2             .req    v8
        XM2             .req    v9
        XH2             .req    v10
        XL3             .req    v11
        XM3             .req    v12
        XH3             .req    v13
        TT3             .req    v14
        TT4             .req    v15
        HH              .req    v16
        HH3             .req    v17
        HH4             .req    v18
        HH34            .req    v19

        .text
        .arch           armv8-a+crypto

        .macro          __pmull_p64, rd, rn, rm
        pmull           \rd\().1q, \rn\().1d, \rm\().1d
        .endm

        .macro          __pmull2_p64, rd, rn, rm
        pmull2          \rd\().1q, \rn\().2d, \rm\().2d
        .endm

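        /*
         * For reference: a single PMULL Vd.1Q, Vn.1D, Vm.1D performs the
         * whole 64x64 -> 128 bit carryless (polynomial) multiplication
         * that the following C sketch computes bit by bit. Illustration
         * only, not part of the build; struct u128 and clmul64() are
         * made-up names, reused by the later sketches in this file.
         *
         *      #include <stdint.h>
         *
         *      struct u128 { uint64_t lo, hi; };
         *
         *      static struct u128 clmul64(uint64_t a, uint64_t b)
         *      {
         *              struct u128 r = { 0, 0 };
         *              int i;
         *
         *              for (i = 0; i < 64; i++) {
         *                      if (b >> i & 1) {
         *                              r.lo ^= a << i;         // bits i..63
         *                              if (i)                  // bits 64..127
         *                                      r.hi ^= a >> (64 - i);
         *                      }
         *              }
         *              return r;
         *      }
         */
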
        .macro          __pmull_p8, rq, ad, bd
        ext             t3.8b, \ad\().8b, \ad\().8b, #1         // A1
        ext             t5.8b, \ad\().8b, \ad\().8b, #2         // A2
        ext             t7.8b, \ad\().8b, \ad\().8b, #3         // A3

        __pmull_p8_\bd  \rq, \ad
        .endm

        .macro          __pmull2_p8, rq, ad, bd
        tbl             t3.16b, {\ad\().16b}, perm1.16b         // A1
        tbl             t5.16b, {\ad\().16b}, perm2.16b         // A2
        tbl             t7.16b, {\ad\().16b}, perm3.16b         // A3

        __pmull2_p8_\bd \rq, \ad
        .endm

        .macro          __pmull_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_SHASH2, rq, ad
        __pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
        .endm

        .macro          __pmull2_p8_SHASH, rq, ad
        __pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
        .endm

        .macro          __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
        pmull\t         t3.8h, t3.\nb, \bd                      // F = A1*B
        pmull\t         t4.8h, \ad, \b1\().\nb                  // E = A*B1
        pmull\t         t5.8h, t5.\nb, \bd                      // H = A2*B
        pmull\t         t6.8h, \ad, \b2\().\nb                  // G = A*B2
        pmull\t         t7.8h, t7.\nb, \bd                      // J = A3*B
        pmull\t         t8.8h, \ad, \b3\().\nb                  // I = A*B3
        pmull\t         t9.8h, \ad, \b4\().\nb                  // K = A*B4
        pmull\t         \rq\().8h, \ad, \bd                     // D = A*B

        eor             t3.16b, t3.16b, t4.16b                  // L = E + F
        eor             t5.16b, t5.16b, t6.16b                  // M = G + H
        eor             t7.16b, t7.16b, t8.16b                  // N = I + J

        uzp1            t4.2d, t3.2d, t5.2d
        uzp2            t3.2d, t3.2d, t5.2d
        uzp1            t6.2d, t7.2d, t9.2d
        uzp2            t7.2d, t7.2d, t9.2d

        // t3 = (L) (P0 + P1) << 8
        // t5 = (M) (P2 + P3) << 16
        eor             t4.16b, t4.16b, t3.16b
        and             t3.16b, t3.16b, k32_48.16b

        // t7 = (N) (P4 + P5) << 24
        // t9 = (K) (P6 + P7) << 32
        eor             t6.16b, t6.16b, t7.16b
        and             t7.16b, t7.16b, k00_16.16b

        eor             t4.16b, t4.16b, t3.16b
        eor             t6.16b, t6.16b, t7.16b

        zip2            t5.2d, t4.2d, t3.2d
        zip1            t3.2d, t4.2d, t3.2d
        zip2            t9.2d, t6.2d, t7.2d
        zip1            t7.2d, t6.2d, t7.2d

        ext             t3.16b, t3.16b, t3.16b, #15
        ext             t5.16b, t5.16b, t5.16b, #14
        ext             t7.16b, t7.16b, t7.16b, #13
        ext             t9.16b, t9.16b, t9.16b, #12

        eor             t3.16b, t3.16b, t5.16b
        eor             t7.16b, t7.16b, t9.16b
        eor             \rq\().16b, \rq\().16b, t3.16b
        eor             \rq\().16b, \rq\().16b, t7.16b
        .endm
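
        /*
         * When only the 8-bit polynomial multiplier is available, the
         * macros above assemble the 64x64 product from 8x8 -> 16 bit
         * partial products of A against byte-rotated copies of B (the
         * A1..A3 / B1..B4 terms annotated above). A scalar C sketch that
         * computes the same product bytewise; illustration only, it does
         * not mirror the vectorized ext/uzp/zip realignment schedule:
         *
         *      static uint16_t clmul8(uint8_t a, uint8_t b)
         *      {
         *              uint16_t r = 0;
         *              int i;
         *
         *              for (i = 0; i < 8; i++)
         *                      if (b >> i & 1)
         *                              r ^= (uint16_t)a << i;
         *              return r;
         *      }
         *
         *      static struct u128 clmul64_bytewise(uint64_t a, uint64_t b)
         *      {
         *              struct u128 r = { 0, 0 };
         *              int i, j;
         *
         *              for (i = 0; i < 8; i++)
         *              for (j = 0; j < 8; j++) {
         *                      uint64_t p = clmul8(a >> (8 * i), b >> (8 * j));
         *                      int sh = 8 * (i + j);
         *
         *                      if (sh < 64) {
         *                              r.lo ^= p << sh;
         *                              if (sh > 48)    // 16-bit product spills
         *                                      r.hi ^= p >> (64 - sh);
         *                      } else {
         *                              r.hi ^= p << (sh - 64);
         *                      }
         *              }
         *              return r;
         *      }
         */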

        .macro          __pmull_pre_p64
        add             x8, x3, #16
        ld1             {HH.2d-HH4.2d}, [x8]

        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        trn1            HH34.2d, HH3.2d, HH4.2d
        trn2            T1.2d, HH3.2d, HH4.2d
        eor             HH34.16b, HH34.16b, T1.16b

        movi            MASK.16b, #0xe1
        shl             MASK.2d, MASK.2d, #57
        .endm
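
        /*
         * The trn1/trn2/eor sequences above cache, for each pair of hash
         * key powers, the XOR of the high and low 64-bit halves (SHASH2,
         * HH34). That is the Karatsuba precomputation: it cuts each
         * 128x128 bit multiply from four PMULLs to three. A sketch in
         * terms of clmul64() from above (hypothetical helper names):
         *
         *      static void clmul128(struct u128 a, struct u128 b,
         *                           uint64_t b_folded,  // b.hi ^ b.lo
         *                           struct u128 *lo, struct u128 *hi)
         *      {
         *              struct u128 l = clmul64(a.lo, b.lo);    // a0 * b0
         *              struct u128 h = clmul64(a.hi, b.hi);    // a1 * b1
         *              struct u128 m = clmul64(a.lo ^ a.hi, b_folded);
         *
         *              m.lo ^= l.lo ^ h.lo;    // middle term of the product
         *              m.hi ^= l.hi ^ h.hi;
         *              l.hi ^= m.lo;           // add it in at bit offset 64
         *              h.lo ^= m.hi;
         *              *lo = l;
         *              *hi = h;
         *      }
         */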

        .macro          __pmull_pre_p8
        ext             SHASH2.16b, SHASH.16b, SHASH.16b, #8
        eor             SHASH2.16b, SHASH2.16b, SHASH.16b

        // k00_16 := 0x0000000000000000_000000000000ffff
        // k32_48 := 0x00000000ffffffff_0000ffffffffffff
        movi            k32_48.2d, #0xffffffff
        mov             k32_48.h[2], k32_48.h[0]
        ushr            k00_16.2d, k32_48.2d, #32

        // prepare the permutation vectors
        mov_q           x5, 0x080f0e0d0c0b0a09
        movi            T1.8b, #8
        dup             perm1.2d, x5
        eor             perm1.16b, perm1.16b, T1.16b
        ushr            perm2.2d, perm1.2d, #8
        ushr            perm3.2d, perm1.2d, #16
        ushr            T1.2d, perm1.2d, #24
        sli             perm2.2d, perm1.2d, #56
        sli             perm3.2d, perm1.2d, #48
        sli             T1.2d, perm1.2d, #40

        // precompute loop invariants
        tbl             sh1.16b, {SHASH.16b}, perm1.16b
        tbl             sh2.16b, {SHASH.16b}, perm2.16b
        tbl             sh3.16b, {SHASH.16b}, perm3.16b
        tbl             sh4.16b, {SHASH.16b}, T1.16b
        ext             ss1.8b, SHASH2.8b, SHASH2.8b, #1
        ext             ss2.8b, SHASH2.8b, SHASH2.8b, #2
        ext             ss3.8b, SHASH2.8b, SHASH2.8b, #3
        ext             ss4.8b, SHASH2.8b, SHASH2.8b, #4
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
        .macro          __pmull_reduce_p64
        pmull           T2.1q, XL.1d, MASK.1d
        eor             XM.16b, XM.16b, T1.16b

        mov             XH.d[0], XM.d[1]
        mov             XM.d[1], XL.d[0]

        eor             XL.16b, XM.16b, T2.16b
        ext             T2.16b, XL.16b, XL.16b, #8
        pmull           XL.1q, XL.1d, MASK.1d
        .endm
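
        /*
         * What the reduction computes: the 256-bit Karatsuba product must
         * be reduced modulo the GCM polynomial g(x) = x^128 + x^7 + x^2 +
         * x + 1. A bit-serial C sketch, ignoring GCM's bit-reflected data
         * representation (which the code handles with rev64/ext and the
         * 0xe1 << 57 MASK constant); 0x87 encodes x^7 + x^2 + x + 1:
         *
         *      static struct u128 ghash_reduce(struct u128 lo, struct u128 hi)
         *      {
         *              uint64_t v[4] = { lo.lo, lo.hi, hi.lo, hi.hi };
         *              int i;
         *
         *              for (i = 255; i >= 128; i--) {
         *                      if (v[i / 64] >> (i % 64) & 1) {
         *                              int sh = i - 128, w = sh / 64, b = sh % 64;
         *
         *                              v[i / 64] ^= 1ULL << (i % 64); // x^128 term
         *                              v[w] ^= 0x87ULL << b;
         *                              if (b > 56)     // crosses a word boundary
         *                                      v[w + 1] ^= 0x87ULL >> (64 - b);
         *                      }
         *              }
         *              return (struct u128){ v[0], v[1] };
         *      }
         */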

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
        .macro          __pmull_reduce_p8
        eor             XM.16b, XM.16b, T1.16b

        mov             XL.d[1], XM.d[0]
        mov             XH.d[0], XM.d[1]

        shl             T1.2d, XL.2d, #57
        shl             T2.2d, XL.2d, #62
        eor             T2.16b, T2.16b, T1.16b
        shl             T1.2d, XL.2d, #63
        eor             T2.16b, T2.16b, T1.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             T2.16b, T2.16b, T1.16b

        mov             XL.d[1], T2.d[0]
        mov             XH.d[0], T2.d[1]

        ushr            T2.2d, XL.2d, #1
        eor             XH.16b, XH.16b, XL.16b
        eor             XL.16b, XL.16b, T2.16b
        ushr            T2.2d, T2.2d, #6
        ushr            XL.2d, XL.2d, #1
        .endm
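
        /*
         * Note on the shift constants: 57, 62 and 63 are 64 minus the
         * exponents 7, 2 and 1 of g(x) = x^128 + x^7 + x^2 + x + 1, so
         * the shl/ushr pairs above implement the same fold by g(x) as
         * the PMULL by MASK in __pmull_reduce_p64, using only shifts
         * and XORs.
         */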

        .macro          __pmull_ghash, pn
        ld1             {SHASH.2d}, [x3]
        ld1             {XL.2d}, [x1]

        __pmull_pre_\pn

        /* do the head block first, if supplied */
        cbz             x4, 0f
        ld1             {T1.2d}, [x4]
        mov             x4, xzr
        b               3f

0:      .ifc            \pn, p64
        tbnz            w0, #0, 2f              // skip until #blocks is a
        tbnz            w0, #1, 2f              // round multiple of 4

1:      ld1             {XM3.16b-TT4.16b}, [x2], #64

        sub             w0, w0, #4

        rev64           T1.16b, XM3.16b
        rev64           T2.16b, XH3.16b
        rev64           TT4.16b, TT4.16b
        rev64           TT3.16b, TT3.16b

        ext             IN1.16b, TT4.16b, TT4.16b, #8
        ext             XL3.16b, TT3.16b, TT3.16b, #8

        eor             TT4.16b, TT4.16b, IN1.16b
        pmull2          XH2.1q, SHASH.2d, IN1.2d        // a1 * b1
        pmull           XL2.1q, SHASH.1d, IN1.1d        // a0 * b0
        pmull           XM2.1q, SHASH2.1d, TT4.1d       // (a1 + a0)(b1 + b0)

        eor             TT3.16b, TT3.16b, XL3.16b
        pmull2          XH3.1q, HH.2d, XL3.2d           // a1 * b1
        pmull           XL3.1q, HH.1d, XL3.1d           // a0 * b0
        pmull2          XM3.1q, SHASH2.2d, TT3.2d       // (a1 + a0)(b1 + b0)

        ext             IN1.16b, T2.16b, T2.16b, #8
        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        eor             T2.16b, T2.16b, IN1.16b
        pmull2          XH3.1q, HH3.2d, IN1.2d          // a1 * b1
        pmull           XL3.1q, HH3.1d, IN1.1d          // a0 * b0
        pmull           XM3.1q, HH34.1d, T2.1d          // (a1 + a0)(b1 + b0)

        eor             XL2.16b, XL2.16b, XL3.16b
        eor             XH2.16b, XH2.16b, XH3.16b
        eor             XM2.16b, XM2.16b, XM3.16b

        ext             IN1.16b, T1.16b, T1.16b, #8
        ext             TT3.16b, XL.16b, XL.16b, #8
        eor             XL.16b, XL.16b, IN1.16b
        eor             T1.16b, T1.16b, TT3.16b

        pmull2          XH.1q, HH4.2d, XL.2d            // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        pmull           XL.1q, HH4.1d, XL.1d            // a0 * b0
        pmull2          XM.1q, HH34.2d, T1.2d           // (a1 + a0)(b1 + b0)

        eor             XL.16b, XL.16b, XL2.16b
        eor             XH.16b, XH.16b, XH2.16b
        eor             XM.16b, XM.16b, XM2.16b

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_p64

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbz             w0, 5f
        b               1b
        .endif

2:      ld1             {T1.2d}, [x2], #16
        sub             w0, w0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64           T1.16b, T1.16b  )

        ext             T2.16b, XL.16b, XL.16b, #8
        ext             IN1.16b, T1.16b, T1.16b, #8
        eor             T1.16b, T1.16b, T2.16b
        eor             XL.16b, XL.16b, IN1.16b

        __pmull2_\pn    XH, XL, SHASH                   // a1 * b1
        eor             T1.16b, T1.16b, XL.16b
        __pmull_\pn     XL, XL, SHASH                   // a0 * b0
        __pmull_\pn     XM, T1, SHASH2                  // (a1 + a0)(b1 + b0)

4:      eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_\pn

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        cbnz            w0, 0b

5:      st1             {XL.2d}, [x1]
        ret
        .endm
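
        /*
         * The 4-way path above relies on the aggregation identity
         *
         *      ((((X + B0).H + B1).H + B2).H + B3).H =
         *              (X + B0).H^4 + B1.H^3 + B2.H^2 + B3.H
         *
         * so four blocks are folded with a single reduction, using the
         * precomputed powers H..H^4. In terms of the earlier sketches
         * (illustration only; clmul128()/ghash_reduce() as defined above):
         *
         *      static struct u128 ghash_4blocks(struct u128 x,
         *                                       const struct u128 blk[4],
         *                                       const struct u128 h_pow[4], // H^4..H
         *                                       const uint64_t folded[4])
         *      {
         *              struct u128 lo = { 0, 0 }, hi = { 0, 0 }, l, h;
         *              int i;
         *
         *              for (i = 0; i < 4; i++) {
         *                      struct u128 a = blk[i];
         *
         *                      if (i == 0) {   // digest folds into block 0
         *                              a.lo ^= x.lo;
         *                              a.hi ^= x.hi;
         *                      }
         *                      clmul128(a, h_pow[i], folded[i], &l, &h);
         *                      lo.lo ^= l.lo;  // XOR unreduced products
         *                      lo.hi ^= l.hi;
         *                      hi.lo ^= h.lo;
         *                      hi.hi ^= h.hi;
         *              }
         *              return ghash_reduce(lo, hi);
         *      }
         */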

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
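        /*
         * Per AAPCS64 the arguments arrive as x0 = blocks, x1 = dg,
         * x2 = src, x3 = k, x4 = head, which matches the register usage
         * in __pmull_ghash above.
         */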
SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
        __pmull_ghash   p64
SYM_FUNC_END(pmull_ghash_update_p64)

SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
        __pmull_ghash   p8
SYM_FUNC_END(pmull_ghash_update_p8)

        KS0             .req    v8
        KS1             .req    v9
        KS2             .req    v10
        KS3             .req    v11

        INP0            .req    v21
        INP1            .req    v22
        INP2            .req    v23
        INP3            .req    v24

        K0              .req    v25
        K1              .req    v26
        K2              .req    v27
        K3              .req    v28
        K4              .req    v12
        K5              .req    v13
        K6              .req    v4
        K7              .req    v5
        K8              .req    v14
        K9              .req    v15
        KK              .req    v29
        KL              .req    v30
        KM              .req    v31

        .macro          load_round_keys, rounds, rk, tmp
        add             \tmp, \rk, #64
        ld1             {K0.4s-K3.4s}, [\rk]
        ld1             {K4.4s-K5.4s}, [\tmp]
        add             \tmp, \rk, \rounds, lsl #4
        sub             \tmp, \tmp, #32
        ld1             {KK.4s-KM.4s}, [\tmp]
        .endm

        .macro          enc_round, state, key
        aese            \state\().16b, \key\().16b
        aesmc           \state\().16b, \state\().16b
        .endm

        .macro          enc_qround, s0, s1, s2, s3, key
        enc_round       \s0, \key
        enc_round       \s1, \key
        enc_round       \s2, \key
        enc_round       \s3, \key
        .endm

        .macro          enc_block, state, rounds, rk, tmp
        add             \tmp, \rk, #96
        ld1             {K6.4s-K7.4s}, [\tmp], #32
        .irp            key, K0, K1, K2, K3, K4, K5
        enc_round       \state, \key
        .endr

        tbnz            \rounds, #2, .Lnot128_\@
.Lout256_\@:
        enc_round       \state, K6
        enc_round       \state, K7

.Lout192_\@:
        enc_round       \state, KK
        aese            \state\().16b, KL.16b
        eor             \state\().16b, \state\().16b, KM.16b

        .subsection     1
.Lnot128_\@:
        ld1             {K8.4s-K9.4s}, [\tmp], #32
        enc_round       \state, K6
        enc_round       \state, K7
        ld1             {K6.4s-K7.4s}, [\tmp]
        enc_round       \state, K8
        enc_round       \state, K9
        tbz             \rounds, #1, .Lout192_\@
        b               .Lout256_\@
        .previous
        .endm
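
        /*
         * enc_block performs \rounds - 1 full AES rounds (aese + aesmc)
         * followed by a final round without MixColumns and an XOR with
         * the last round key. The bit tests work because \rounds is 10
         * (0b1010), 12 (0b1100) or 14 (0b1110): bit 2 separates AES-128
         * from the larger key sizes, bit 1 then separates AES-192 from
         * AES-256. C sketch with hypothetical helpers (aese() standing
         * for AddRoundKey + SubBytes + ShiftRows, aesmc() for
         * MixColumns, xor16() for a 16-byte XOR):
         *
         *      static void aes_encrypt_block(uint8_t state[16], int nr,
         *                                    const uint8_t rk[][16])
         *      {
         *              int i;
         *
         *              for (i = 0; i < nr - 1; i++) {  // enc_round
         *                      aese(state, rk[i]);
         *                      aesmc(state);
         *              }
         *              aese(state, rk[nr - 1]);        // final round
         *              xor16(state, rk[nr]);           // last round key
         *      }
         */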

        .align          6
        .macro          pmull_gcm_do_crypt, enc
        frame_push      1

        load_round_keys x7, x6, x8

        ld1             {SHASH.2d}, [x3], #16
        ld1             {HH.2d-HH4.2d}, [x3]

        trn1            SHASH2.2d, SHASH.2d, HH.2d
        trn2            T1.2d, SHASH.2d, HH.2d
        eor             SHASH2.16b, SHASH2.16b, T1.16b

        trn1            HH34.2d, HH3.2d, HH4.2d
        trn2            T1.2d, HH3.2d, HH4.2d
        eor             HH34.16b, HH34.16b, T1.16b

        ld1             {XL.2d}, [x4]

        cbz             x0, 3f                          // tag only?

        ldr             w8, [x5, #12]                   // load lower counter
CPU_LE( rev             w8, w8          )

0:      mov             w9, #4                          // max blocks per round
        add             x10, x0, #0xf
        lsr             x10, x10, #4                    // remaining blocks

        subs            x0, x0, #64
        csel            w9, w10, w9, mi
        add             w8, w8, w9

        bmi             1f
        ld1             {INP0.16b-INP3.16b}, [x2], #64
        .subsection     1
        /*
         * Populate the four input registers right to left with up to 63 bytes
         * of data, using overlapping loads to avoid branches.
         *
         *                INP0     INP1     INP2     INP3
         *  1 byte     |        |        |        |x       |
         * 16 bytes    |        |        |        |xxxxxxxx|
         * 17 bytes    |        |        |xxxxxxxx|x       |
         * 47 bytes    |        |xxxxxxxx|xxxxxxxx|xxxxxxx |
         * etc etc
         *
         * Note that this code may read up to 15 bytes before the start of
         * the input. It is up to the calling code to ensure this is safe if
         * this happens in the first iteration of the loop (i.e., when the
         * input size is < 16 bytes)
         */
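        /*
         * Below, x19 ends up holding the size of the final, possibly
         * partial block (1..16 bytes); x14/x15/x16 become the
         * post-increments for the INP0/INP1/INP2 loads, forced to zero
         * once the input is exhausted; and when no more than one block
         * remains, x1/x2 are wound back so that the 16-byte accesses
         * end exactly at the end of the data.
         */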
1:      mov             x15, #16
        ands            x19, x0, #0xf
        csel            x19, x19, x15, ne
        adr_l           x17, .Lpermute_table + 16

        sub             x11, x15, x19
        add             x12, x17, x11
        sub             x17, x17, x11
        ld1             {T1.16b}, [x12]
        sub             x10, x1, x11
        sub             x11, x2, x11

        cmp             x0, #-16
        csel            x14, x15, xzr, gt
        cmp             x0, #-32
        csel            x15, x15, xzr, gt
        cmp             x0, #-48
        csel            x16, x19, xzr, gt
        csel            x1, x1, x10, gt
        csel            x2, x2, x11, gt

        ld1             {INP0.16b}, [x2], x14
        ld1             {INP1.16b}, [x2], x15
        ld1             {INP2.16b}, [x2], x16
        ld1             {INP3.16b}, [x2]
        tbl             INP3.16b, {INP3.16b}, T1.16b
        b               2f
        .previous

2:      .if             \enc == 0
        bl              pmull_gcm_ghash_4x
        .endif

        bl              pmull_gcm_enc_4x

        tbnz            x0, #63, 6f
        st1             {INP0.16b-INP3.16b}, [x1], #64
        .if             \enc == 1
        bl              pmull_gcm_ghash_4x
        .endif
        bne             0b

3:      ldr             x10, [sp, #.Lframe_local_offset]
        cbz             x10, 5f                         // output tag?

        ld1             {INP3.16b}, [x10]               // load lengths[]
        mov             w9, #1
        bl              pmull_gcm_ghash_4x

        mov             w11, #(0x1 << 24)               // BE '1U'
        ld1             {KS0.16b}, [x5]
        mov             KS0.s[3], w11

        enc_block       KS0, x7, x6, x12

        ext             XL.16b, XL.16b, XL.16b, #8
        rev64           XL.16b, XL.16b
        eor             XL.16b, XL.16b, KS0.16b

        .if             \enc == 1
        st1             {XL.16b}, [x10]                 // store tag
        .else
        ldp             x11, x12, [sp, #40]             // load tag pointer and authsize
        adr_l           x17, .Lpermute_table
        ld1             {KS0.16b}, [x11]                // load supplied tag
        add             x17, x17, x12
        ld1             {KS1.16b}, [x17]                // load permute vector

        cmeq            XL.16b, XL.16b, KS0.16b         // compare tags
        mvn             XL.16b, XL.16b                  // -1 for fail, 0 for pass
        tbl             XL.16b, {XL.16b}, KS1.16b       // keep authsize bytes only
        sminv           b0, XL.16b                      // signed minimum across XL
        smov            w0, v0.b[0]                     // return b0
        .endif

4:      frame_pop
        ret

5:
CPU_LE( rev             w8, w8          )
        str             w8, [x5, #12]                   // store lower counter
        st1             {XL.2d}, [x4]
        b               4b

6:      ld1             {T1.16b-T2.16b}, [x17], #32     // permute vectors
        sub             x17, x17, x19, lsl #1

        cmp             w9, #1
        beq             7f
        .subsection     1
7:      ld1             {INP2.16b}, [x1]
        tbx             INP2.16b, {INP3.16b}, T1.16b
        mov             INP3.16b, INP2.16b
        b               8f
        .previous

        st1             {INP0.16b}, [x1], x14
        st1             {INP1.16b}, [x1], x15
        st1             {INP2.16b}, [x1], x16
        tbl             INP3.16b, {INP3.16b}, T1.16b
        tbx             INP3.16b, {INP2.16b}, T2.16b
8:      st1             {INP3.16b}, [x1]

        .if             \enc == 1
        ld1             {T1.16b}, [x17]
        tbl             INP3.16b, {INP3.16b}, T1.16b    // clear non-data bits
        bl              pmull_gcm_ghash_4x
        .endif
        b               3b
        .endm
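
        /*
         * Tag finalisation (label 3 above): hash in the lengths block,
         * encrypt the initial counter block J0 = IV || 0x00000001, and
         * XOR it with the byte-reversed digest; on decryption the result
         * is compared against the supplied tag without branching on
         * secret data (cmeq/mvn/tbl/sminv, returning 0 for a match and a
         * negative value otherwise, over authsize bytes only). A sketch
         * in terms of the helpers above, again ignoring GCM's
         * bit-reflection:
         *
         *      static void gcm_final_tag(uint8_t tag[16],
         *                                const uint8_t nonce[12],
         *                                struct u128 dg, int nr,
         *                                const uint8_t rk[][16])
         *      {
         *              uint8_t ctr[16] = { 0 };
         *              int i;
         *
         *              memcpy(ctr, nonce, 12);
         *              ctr[15] = 1;                    // J0 = nonce || 1
         *              aes_encrypt_block(ctr, nr, rk); // E(K, J0)
         *
         *              for (i = 0; i < 8; i++) {       // big-endian digest
         *                      tag[i]     = (uint8_t)(dg.hi >> (56 - 8 * i)) ^ ctr[i];
         *                      tag[8 + i] = (uint8_t)(dg.lo >> (56 - 8 * i)) ^ ctr[8 + i];
         *              }
         *      }
         */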

        /*
         * void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u64 dg[], u8 ctr[],
         *                        int rounds, u8 tag)
         */
SYM_FUNC_START(pmull_gcm_encrypt)
        pmull_gcm_do_crypt      1
SYM_FUNC_END(pmull_gcm_encrypt)

        /*
         * void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
         *                        struct ghash_key const *k, u64 dg[], u8 ctr[],
         *                        int rounds, u8 tag)
         */
SYM_FUNC_START(pmull_gcm_decrypt)
        pmull_gcm_do_crypt      0
SYM_FUNC_END(pmull_gcm_decrypt)

SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
        movi            MASK.16b, #0xe1
        shl             MASK.2d, MASK.2d, #57

        rev64           T1.16b, INP0.16b
        rev64           T2.16b, INP1.16b
        rev64           TT3.16b, INP2.16b
        rev64           TT4.16b, INP3.16b

        ext             XL.16b, XL.16b, XL.16b, #8

        tbz             w9, #2, 0f                      // <4 blocks?
        .subsection     1
0:      movi            XH2.16b, #0
        movi            XM2.16b, #0
        movi            XL2.16b, #0

        tbz             w9, #0, 1f                      // 2 blocks?
        tbz             w9, #1, 2f                      // 1 block?

        eor             T2.16b, T2.16b, XL.16b
        ext             T1.16b, T2.16b, T2.16b, #8
        b               .Lgh3

1:      eor             TT3.16b, TT3.16b, XL.16b
        ext             T2.16b, TT3.16b, TT3.16b, #8
        b               .Lgh2

2:      eor             TT4.16b, TT4.16b, XL.16b
        ext             IN1.16b, TT4.16b, TT4.16b, #8
        b               .Lgh1
        .previous

        eor             T1.16b, T1.16b, XL.16b
        ext             IN1.16b, T1.16b, T1.16b, #8

        pmull2          XH2.1q, HH4.2d, IN1.2d          // a1 * b1
        eor             T1.16b, T1.16b, IN1.16b
        pmull           XL2.1q, HH4.1d, IN1.1d          // a0 * b0
        pmull2          XM2.1q, HH34.2d, T1.2d          // (a1 + a0)(b1 + b0)

        ext             T1.16b, T2.16b, T2.16b, #8
.Lgh3:  eor             T2.16b, T2.16b, T1.16b
        pmull2          XH.1q, HH3.2d, T1.2d            // a1 * b1
        pmull           XL.1q, HH3.1d, T1.1d            // a0 * b0
        pmull           XM.1q, HH34.1d, T2.1d           // (a1 + a0)(b1 + b0)

        eor             XH2.16b, XH2.16b, XH.16b
        eor             XL2.16b, XL2.16b, XL.16b
        eor             XM2.16b, XM2.16b, XM.16b

        ext             T2.16b, TT3.16b, TT3.16b, #8
.Lgh2:  eor             TT3.16b, TT3.16b, T2.16b
        pmull2          XH.1q, HH.2d, T2.2d             // a1 * b1
        pmull           XL.1q, HH.1d, T2.1d             // a0 * b0
        pmull2          XM.1q, SHASH2.2d, TT3.2d        // (a1 + a0)(b1 + b0)

        eor             XH2.16b, XH2.16b, XH.16b
        eor             XL2.16b, XL2.16b, XL.16b
        eor             XM2.16b, XM2.16b, XM.16b

        ext             IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1:  eor             TT4.16b, TT4.16b, IN1.16b
        pmull           XL.1q, SHASH.1d, IN1.1d         // a0 * b0
        pmull2          XH.1q, SHASH.2d, IN1.2d         // a1 * b1
        pmull           XM.1q, SHASH2.1d, TT4.1d        // (a1 + a0)(b1 + b0)

        eor             XH.16b, XH.16b, XH2.16b
        eor             XL.16b, XL.16b, XL2.16b
        eor             XM.16b, XM.16b, XM2.16b

        eor             T2.16b, XL.16b, XH.16b
        ext             T1.16b, XL.16b, XH.16b, #8
        eor             XM.16b, XM.16b, T2.16b

        __pmull_reduce_p64

        eor             T2.16b, T2.16b, XH.16b
        eor             XL.16b, XL.16b, T2.16b

        ret
SYM_FUNC_END(pmull_gcm_ghash_4x)

SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
        ld1             {KS0.16b}, [x5]                 // load upper counter
        sub             w10, w8, #4
        sub             w11, w8, #3
        sub             w12, w8, #2
        sub             w13, w8, #1
        rev             w10, w10
        rev             w11, w11
        rev             w12, w12
        rev             w13, w13
        mov             KS1.16b, KS0.16b
        mov             KS2.16b, KS0.16b
        mov             KS3.16b, KS0.16b
        ins             KS0.s[3], w10                   // set lower counter
        ins             KS1.s[3], w11
        ins             KS2.s[3], w12
        ins             KS3.s[3], w13

        add             x10, x6, #96                    // round key pointer
        ld1             {K6.4s-K7.4s}, [x10], #32
        .irp            key, K0, K1, K2, K3, K4, K5
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr

        tbnz            x7, #2, .Lnot128
        .subsection     1
.Lnot128:
        ld1             {K8.4s-K9.4s}, [x10], #32
        .irp            key, K6, K7
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr
        ld1             {K6.4s-K7.4s}, [x10]
        .irp            key, K8, K9
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr
        tbz             x7, #1, .Lout192
        b               .Lout256
        .previous

.Lout256:
        .irp            key, K6, K7
        enc_qround      KS0, KS1, KS2, KS3, \key
        .endr

.Lout192:
        enc_qround      KS0, KS1, KS2, KS3, KK

        aese            KS0.16b, KL.16b
        aese            KS1.16b, KL.16b
        aese            KS2.16b, KL.16b
        aese            KS3.16b, KL.16b

        eor             KS0.16b, KS0.16b, KM.16b
        eor             KS1.16b, KS1.16b, KM.16b
        eor             KS2.16b, KS2.16b, KM.16b
        eor             KS3.16b, KS3.16b, KM.16b

        eor             INP0.16b, INP0.16b, KS0.16b
        eor             INP1.16b, INP1.16b, KS1.16b
        eor             INP2.16b, INP2.16b, KS2.16b
        eor             INP3.16b, INP3.16b, KS3.16b

        ret
SYM_FUNC_END(pmull_gcm_enc_4x)
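
        /*
         * pmull_gcm_enc_4x derives four keystream blocks from a single
         * counter block by patching its last word with the big-endian
         * values w8 - 4 .. w8 - 1 (w8 was already advanced by the
         * caller, hence the rev before ins), encrypting all four in
         * parallel, and XORing them into INP0..INP3. Sketch with the
         * hypothetical aes_encrypt_block() from above:
         *
         *      static void ctr_4x(uint8_t ks[4][16],
         *                         const uint8_t ctr_blk[16],
         *                         uint32_t next_ctr, int nr,
         *                         const uint8_t rk[][16])
         *      {
         *              int i;
         *
         *              for (i = 0; i < 4; i++) {
         *                      uint32_t c = next_ctr - 4 + i;
         *
         *                      memcpy(ks[i], ctr_blk, 12);
         *                      ks[i][12] = c >> 24;    // big-endian counter
         *                      ks[i][13] = c >> 16;
         *                      ks[i][14] = c >> 8;
         *                      ks[i][15] = c;
         *                      aes_encrypt_block(ks[i], nr, rk);
         *              }
         *      }
         */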

        .section        ".rodata", "a"
        .align          6
.Lpermute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .previous
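
        /*
         * Sliding 16-byte windows into the 0xff / 0x0..0xf / 0xff /
         * 0x0..0xf pattern above yield tbl/tbx index vectors that move
         * block contents into place and zero the remaining lanes (tbl
         * returns 0 for the out-of-range 0xff indices); the window
         * offset encodes how many bytes are valid.
         */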
