Linux/arch/arm64/crypto/sm4-neon-core.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0   v8
#define RTMP1   v9
#define RTMP2   v10
#define RTMP3   v11

#define RTMP4   v12
#define RTMP5   v13
#define RTMP6   v14
#define RTMP7   v15

#define RX0     v12
#define RX1     v13
#define RKEY    v14
#define RIV     v15

/* Helper macros. */

#define SM4_PREPARE()                                           \
        adr_l           x5, crypto_sm4_sbox;                    \
        ld1             {v16.16b-v19.16b}, [x5], #64;           \
        ld1             {v20.16b-v23.16b}, [x5], #64;           \
        ld1             {v24.16b-v27.16b}, [x5], #64;           \
        ld1             {v28.16b-v31.16b}, [x5];

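/*
 * Illustrative C sketch (not part of the original file): the S-box
 * loaded above spans v16-v31, and ROUND4/ROUND8 below walk it with
 * one tbl plus three tbx instructions.  tbl writes 0 to lanes whose
 * index is out of range (>= 64 for a four-register table); each tbx
 * stage then covers the next 64 entries after the indices are
 * rebased by subtracting 64, leaving already-translated lanes
 * untouched.  Names here are hypothetical.
 *
 *	static uint8_t sbox_lookup(const uint8_t sbox[256], uint8_t idx)
 *	{
 *		uint8_t result = 0;	// tbl yields 0 for idx >= 64
 *
 *		for (int stage = 0; stage < 4; stage++) {
 *			if (idx < 64)	// in range for this 64-byte slice
 *				result = sbox[stage * 64 + idx];
 *			idx -= 64;	// rebase; wraps for finished lanes
 *		}
 *		return result;
 *	}
 */
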
#define transpose_4x4(s0, s1, s2, s3)                           \
        zip1            RTMP0.4s, s0.4s, s1.4s;                 \
        zip1            RTMP1.4s, s2.4s, s3.4s;                 \
        zip2            RTMP2.4s, s0.4s, s1.4s;                 \
        zip2            RTMP3.4s, s2.4s, s3.4s;                 \
        zip1            s0.2d, RTMP0.2d, RTMP1.2d;              \
        zip2            s1.2d, RTMP0.2d, RTMP1.2d;              \
        zip1            s2.2d, RTMP2.2d, RTMP3.2d;              \
        zip2            s3.2d, RTMP2.2d, RTMP3.2d;

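/*
 * For reference (illustrative C, not in the original): transpose_4x4
 * is a plain 4x4 transpose of 32-bit words, with register si as row
 * i.  It converts four blocks held one-per-register into the
 * word-per-register layout the rounds operate on, and back.
 *
 *	static void transpose_4x4_c(uint32_t m[4][4])
 *	{
 *		uint32_t t[4][4];
 *
 *		for (int i = 0; i < 4; i++)
 *			for (int j = 0; j < 4; j++)
 *				t[i][j] = m[j][i];
 *		memcpy(m, t, sizeof(t));
 *	}
 */
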
#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)        \
        zip1            RTMP0.4s, s0.4s, s1.4s;                 \
        zip1            RTMP1.4s, s2.4s, s3.4s;                 \
        zip2            RTMP2.4s, s0.4s, s1.4s;                 \
        zip2            RTMP3.4s, s2.4s, s3.4s;                 \
        zip1            RTMP4.4s, s4.4s, s5.4s;                 \
        zip1            RTMP5.4s, s6.4s, s7.4s;                 \
        zip2            RTMP6.4s, s4.4s, s5.4s;                 \
        zip2            RTMP7.4s, s6.4s, s7.4s;                 \
        zip1            s0.2d, RTMP0.2d, RTMP1.2d;              \
        zip2            s1.2d, RTMP0.2d, RTMP1.2d;              \
        zip1            s2.2d, RTMP2.2d, RTMP3.2d;              \
        zip2            s3.2d, RTMP2.2d, RTMP3.2d;              \
        zip1            s4.2d, RTMP4.2d, RTMP5.2d;              \
        zip2            s5.2d, RTMP4.2d, RTMP5.2d;              \
        zip1            s6.2d, RTMP6.2d, RTMP7.2d;              \
        zip2            s7.2d, RTMP6.2d, RTMP7.2d;

#define rotate_clockwise_4x4(s0, s1, s2, s3)                    \
        zip1            RTMP0.4s, s1.4s, s0.4s;                 \
        zip2            RTMP1.4s, s1.4s, s0.4s;                 \
        zip1            RTMP2.4s, s3.4s, s2.4s;                 \
        zip2            RTMP3.4s, s3.4s, s2.4s;                 \
        zip1            s0.2d, RTMP2.2d, RTMP0.2d;              \
        zip2            s1.2d, RTMP2.2d, RTMP0.2d;              \
        zip1            s2.2d, RTMP3.2d, RTMP1.2d;              \
        zip2            s3.2d, RTMP3.2d, RTMP1.2d;

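/*
 * rotate_clockwise_4x4 is a 90-degree rotation rather than a plain
 * transpose: reversing the zip operand order makes each output row
 * the corresponding input column read bottom-up.  Illustrative C
 * (not in the original):
 *
 *	t[i][j] = m[3 - j][i];
 *
 * This un-transposes the state and, in the same pass, emits the four
 * state words of each block in reverse order, which is SM4's final
 * reverse transform R(A0, A1, A2, A3) = (A3, A2, A1, A0).
 */
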
#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7) \
        zip1            RTMP0.4s, s1.4s, s0.4s;                 \
        zip1            RTMP2.4s, s3.4s, s2.4s;                 \
        zip2            RTMP1.4s, s1.4s, s0.4s;                 \
        zip2            RTMP3.4s, s3.4s, s2.4s;                 \
        zip1            RTMP4.4s, s5.4s, s4.4s;                 \
        zip1            RTMP6.4s, s7.4s, s6.4s;                 \
        zip2            RTMP5.4s, s5.4s, s4.4s;                 \
        zip2            RTMP7.4s, s7.4s, s6.4s;                 \
        zip1            s0.2d, RTMP2.2d, RTMP0.2d;              \
        zip2            s1.2d, RTMP2.2d, RTMP0.2d;              \
        zip1            s2.2d, RTMP3.2d, RTMP1.2d;              \
        zip2            s3.2d, RTMP3.2d, RTMP1.2d;              \
        zip1            s4.2d, RTMP6.2d, RTMP4.2d;              \
        zip2            s5.2d, RTMP6.2d, RTMP4.2d;              \
        zip1            s6.2d, RTMP7.2d, RTMP5.2d;              \
        zip2            s7.2d, RTMP7.2d, RTMP5.2d;

#define ROUND4(round, s0, s1, s2, s3)                           \
        dup             RX0.4s, RKEY.s[round];                  \
        /* rk ^ s1 ^ s2 ^ s3 */                                 \
        eor             RTMP1.16b, s2.16b, s3.16b;              \
        eor             RX0.16b, RX0.16b, s1.16b;               \
        eor             RX0.16b, RX0.16b, RTMP1.16b;            \
                                                                \
        /* sbox, non-linear part */                             \
        movi            RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
        tbl             RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
                                                                \
        /* linear part */                                       \
        shl             RTMP1.4s, RTMP0.4s, #8;                 \
        shl             RTMP2.4s, RTMP0.4s, #16;                \
        shl             RTMP3.4s, RTMP0.4s, #24;                \
        sri             RTMP1.4s, RTMP0.4s, #(32-8);            \
        sri             RTMP2.4s, RTMP0.4s, #(32-16);           \
        sri             RTMP3.4s, RTMP0.4s, #(32-24);           \
        /* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */            \
        eor             RTMP1.16b, RTMP1.16b, RTMP0.16b;        \
        eor             RTMP1.16b, RTMP1.16b, RTMP2.16b;        \
        /* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */        \
        eor             RTMP3.16b, RTMP3.16b, RTMP0.16b;        \
        shl             RTMP2.4s, RTMP1.4s, #2;                 \
        sri             RTMP2.4s, RTMP1.4s, #(32-2);            \
        eor             RTMP3.16b, RTMP3.16b, RTMP2.16b;        \
        /* s0 ^= RTMP3 */                                       \
        eor             s0.16b, s0.16b, RTMP3.16b;

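/*
 * One SM4 round in C, for reference (illustrative only; rol32 is the
 * usual 32-bit left-rotate).  ROUND4 computes this for four blocks
 * at once, one block per 32-bit lane:
 *
 *	x0 ^= T(x1 ^ x2 ^ x3 ^ rk)
 *
 * where T applies the S-box to each byte, then the linear transform
 * L, factored exactly as in the macro:
 *
 *	static uint32_t sm4_t(uint32_t x)
 *	{
 *		uint32_t t = 0;
 *
 *		for (int i = 0; i < 4; i++)	// per-byte S-box
 *			t |= (uint32_t)sbox[(x >> (8 * i)) & 0xff] << (8 * i);
 *		// t ^ rol32(t, 24) ^ rol32(t ^ rol32(t, 8) ^ rol32(t, 16), 2)
 *		//  == t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^ rol32(t, 24)
 *		return t ^ rol32(t, 24) ^
 *		       rol32(t ^ rol32(t, 8) ^ rol32(t, 16), 2);
 *	}
 */
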
#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)                       \
        mov             x6, 8;                                  \
4:                                                              \
        ld1             {RKEY.4s}, [x0], #16;                   \
        subs            x6, x6, #1;                             \
                                                                \
        ROUND4(0, b0, b1, b2, b3);                              \
        ROUND4(1, b1, b2, b3, b0);                              \
        ROUND4(2, b2, b3, b0, b1);                              \
        ROUND4(3, b3, b0, b1, b2);                              \
                                                                \
        bne             4b;                                     \
                                                                \
        rev32           b0.16b, b0.16b;                         \
        rev32           b1.16b, b1.16b;                         \
        rev32           b2.16b, b2.16b;                         \
        rev32           b3.16b, b3.16b;                         \
                                                                \
        rotate_clockwise_4x4(b0, b1, b2, b3);                   \
                                                                \
        /* rewind x0 to the start of the round key array */     \
        sub             x0, x0, #128;

#define SM4_CRYPT_BLK4(b0, b1, b2, b3)                          \
        rev32           b0.16b, b0.16b;                         \
        rev32           b1.16b, b1.16b;                         \
        rev32           b2.16b, b2.16b;                         \
        rev32           b3.16b, b3.16b;                         \
        SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);

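/*
 * Per lane, SM4_CRYPT_BLK4 amounts to the following single-block
 * outline (illustrative C, not in the original): byte-swap the
 * big-endian input words, run 32 rounds (8 loop iterations of 4
 * unrolled rounds, rotating through b0..b3), byte-swap back, and
 * emit the state words in reverse order via rotate_clockwise_4x4.
 *
 *	static void sm4_crypt_block(const uint32_t rk[32], uint32_t x[4])
 *	{
 *		for (int i = 0; i < 32; i++)
 *			x[i % 4] ^= sm4_t(x[(i + 1) % 4] ^
 *					  x[(i + 2) % 4] ^
 *					  x[(i + 3) % 4] ^ rk[i]);
 *		// output word order is (x[3], x[2], x[1], x[0])
 *	}
 */
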
#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)           \
        /* rk ^ s1 ^ s2 ^ s3 */                                 \
        dup             RX0.4s, RKEY.s[round];                  \
        eor             RTMP0.16b, s2.16b, s3.16b;              \
        mov             RX1.16b, RX0.16b;                       \
        eor             RTMP1.16b, t2.16b, t3.16b;              \
        eor             RX0.16b, RX0.16b, s1.16b;               \
        eor             RX1.16b, RX1.16b, t1.16b;               \
        eor             RX0.16b, RX0.16b, RTMP0.16b;            \
        eor             RX1.16b, RX1.16b, RTMP1.16b;            \
                                                                \
        /* sbox, non-linear part */                             \
        movi            RTMP3.16b, #64;  /* sizeof(sbox) / 4 */ \
        tbl             RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;  \
        tbl             RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        sub             RX1.16b, RX1.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;  \
        tbx             RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        sub             RX1.16b, RX1.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;  \
        tbx             RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;  \
        sub             RX0.16b, RX0.16b, RTMP3.16b;            \
        sub             RX1.16b, RX1.16b, RTMP3.16b;            \
        tbx             RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;  \
        tbx             RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;  \
                                                                \
        /* linear part */                                       \
        shl             RX0.4s, RTMP0.4s, #8;                   \
        shl             RX1.4s, RTMP1.4s, #8;                   \
        shl             RTMP2.4s, RTMP0.4s, #16;                \
        shl             RTMP3.4s, RTMP1.4s, #16;                \
        sri             RX0.4s, RTMP0.4s, #(32 - 8);            \
        sri             RX1.4s, RTMP1.4s, #(32 - 8);            \
        sri             RTMP2.4s, RTMP0.4s, #(32 - 16);         \
        sri             RTMP3.4s, RTMP1.4s, #(32 - 16);         \
        /* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */               \
        eor             RX0.16b, RX0.16b, RTMP0.16b;            \
        eor             RX1.16b, RX1.16b, RTMP1.16b;            \
        eor             RX0.16b, RX0.16b, RTMP2.16b;            \
        eor             RX1.16b, RX1.16b, RTMP3.16b;            \
        /* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */        \
        shl             RTMP2.4s, RTMP0.4s, #24;                \
        shl             RTMP3.4s, RTMP1.4s, #24;                \
        sri             RTMP2.4s, RTMP0.4s, #(32 - 24);         \
        sri             RTMP3.4s, RTMP1.4s, #(32 - 24);         \
        eor             RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
        eor             RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
        shl             RTMP2.4s, RX0.4s, #2;                   \
        shl             RTMP3.4s, RX1.4s, #2;                   \
        sri             RTMP2.4s, RX0.4s, #(32 - 2);            \
        sri             RTMP3.4s, RX1.4s, #(32 - 2);            \
        eor             RTMP0.16b, RTMP0.16b, RTMP2.16b;        \
        eor             RTMP1.16b, RTMP1.16b, RTMP3.16b;        \
        /* s0/t0 ^= RTMP0/1 */                                  \
        eor             s0.16b, s0.16b, RTMP0.16b;              \
        eor             t0.16b, t0.16b, RTMP1.16b;

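/*
 * ROUND8 is ROUND4 applied to two independent four-block groups
 * (s0..s3 and t0..t3), with the two streams interleaved so their
 * long tbl/tbx dependency chains can overlap in the pipeline; RX1
 * carries the second stream's working value.
 */
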
#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7) \
        rev32           b0.16b, b0.16b;                         \
        rev32           b1.16b, b1.16b;                         \
        rev32           b2.16b, b2.16b;                         \
        rev32           b3.16b, b3.16b;                         \
        rev32           b4.16b, b4.16b;                         \
        rev32           b5.16b, b5.16b;                         \
        rev32           b6.16b, b6.16b;                         \
        rev32           b7.16b, b7.16b;                         \
                                                                \
        mov             x6, 8;                                  \
8:                                                              \
        ld1             {RKEY.4s}, [x0], #16;                   \
        subs            x6, x6, #1;                             \
                                                                \
        ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);              \
        ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);              \
        ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);              \
        ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);              \
                                                                \
        bne             8b;                                     \
                                                                \
        rev32           b0.16b, b0.16b;                         \
        rev32           b1.16b, b1.16b;                         \
        rev32           b2.16b, b2.16b;                         \
        rev32           b3.16b, b3.16b;                         \
        rev32           b4.16b, b4.16b;                         \
        rev32           b5.16b, b5.16b;                         \
        rev32           b6.16b, b6.16b;                         \
        rev32           b7.16b, b7.16b;                         \
                                                                \
        /* rewind x0 to the start of the round key array */     \
        sub             x0, x0, #128;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)                  \
        SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7);        \
        rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);


.align 3
SYM_FUNC_START(sm4_neon_crypt)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   w3: nblocks
         */
        SM4_PREPARE()

.Lcrypt_loop_8x:
        sub             w3, w3, #8
        tbnz            w3, #31, .Lcrypt_4x

        ld4             {v0.4s-v3.4s}, [x2], #64
        ld4             {v4.4s-v7.4s}, [x2], #64

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w3, .Lcrypt_end
        b               .Lcrypt_loop_8x

.Lcrypt_4x:
        add             w3, w3, #8
        cmp             w3, #4
        blt             .Lcrypt_tail

        sub             w3, w3, #4

        ld4             {v0.4s-v3.4s}, [x2], #64

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        st1             {v0.16b-v3.16b}, [x1], #64

        cbz             w3, .Lcrypt_end

.Lcrypt_tail:
        cmp             w3, #2
        ld1             {v0.16b}, [x2], #16
        blt             .Lcrypt_tail_load_done
        ld1             {v1.16b}, [x2], #16
        beq             .Lcrypt_tail_load_done
        ld1             {v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        cmp             w3, #2
        st1             {v0.16b}, [x1], #16
        blt             .Lcrypt_end
        st1             {v1.16b}, [x1], #16
        beq             .Lcrypt_end
        st1             {v2.16b}, [x1], #16

.Lcrypt_end:
        ret
SYM_FUNC_END(sm4_neon_crypt)

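/*
 * Control-flow outline of sm4_neon_crypt above, for reference
 * (illustrative C with hypothetical helper names, not part of the
 * file): 8 blocks per loop iteration while possible, then one
 * 4-block batch, then a 1-3 block tail that still runs as a single
 * 4-wide batch whose unused lanes are computed but never stored.
 *
 *	while (nblocks >= 8) {
 *		crypt8(rk, dst, src);
 *		dst += 128; src += 128; nblocks -= 8;
 *	}
 *	if (nblocks >= 4) {
 *		crypt4(rk, dst, src);
 *		dst += 64; src += 64; nblocks -= 4;
 *	}
 *	if (nblocks)		// 1..3 blocks left
 *		crypt_tail(rk, dst, src, nblocks);
 */
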
.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE()

        ld1             {RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lcbc_dec_4x

        ld4             {v0.4s-v3.4s}, [x2], #64
        ld4             {v4.4s-v7.4s}, [x2]

        SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)

        /* rotate_clockwise_4x4_2x uses RTMP4-RTMP7 (v12-v15), which
         * would clobber RIV; rotate in two 4x4 halves instead */
        rotate_clockwise_4x4(v0, v1, v2, v3)
        rotate_clockwise_4x4(v4, v5, v6, v7)

        sub             x2, x2, #64

        eor             v0.16b, v0.16b, RIV.16b

        ld1             {RTMP0.16b-RTMP3.16b}, [x2], #64
        ld1             {RTMP4.16b-RTMP7.16b}, [x2], #64

        eor             v1.16b, v1.16b, RTMP0.16b
        eor             v2.16b, v2.16b, RTMP1.16b
        eor             v3.16b, v3.16b, RTMP2.16b
        eor             v4.16b, v4.16b, RTMP3.16b
        eor             v5.16b, v5.16b, RTMP4.16b
        eor             v6.16b, v6.16b, RTMP5.16b
        eor             v7.16b, v7.16b, RTMP6.16b

        mov             RIV.16b, RTMP7.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lcbc_dec_end
        b               .Lcbc_dec_loop_8x

.Lcbc_dec_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lcbc_dec_tail

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        rev32           v4.16b, v0.16b
        rev32           v5.16b, v1.16b
        rev32           v6.16b, v2.16b
        rev32           v7.16b, v3.16b

        transpose_4x4(v4, v5, v6, v7)

        SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

        eor             v4.16b, v4.16b, RIV.16b
        eor             v5.16b, v5.16b, v0.16b
        eor             v6.16b, v6.16b, v1.16b
        eor             v7.16b, v7.16b, v2.16b

        mov             RIV.16b, v3.16b

        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lcbc_dec_end

.Lcbc_dec_tail:
        cmp             w4, #2
        ld1             {v0.16b}, [x2], #16
        blt             .Lcbc_dec_tail_load_done
        ld1             {v1.16b}, [x2], #16
        beq             .Lcbc_dec_tail_load_done
        ld1             {v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
        rev32           v4.16b, v0.16b
        rev32           v5.16b, v1.16b
        rev32           v6.16b, v2.16b

        transpose_4x4(v4, v5, v6, v7)

        SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

        cmp             w4, #2
        eor             v4.16b, v4.16b, RIV.16b
        mov             RIV.16b, v0.16b
        st1             {v4.16b}, [x1], #16
        blt             .Lcbc_dec_end

        eor             v5.16b, v5.16b, v0.16b
        mov             RIV.16b, v1.16b
        st1             {v5.16b}, [x1], #16
        beq             .Lcbc_dec_end

        eor             v6.16b, v6.16b, v1.16b
        mov             RIV.16b, v2.16b
        st1             {v6.16b}, [x1], #16

.Lcbc_dec_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_neon_cbc_dec)

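/*
 * CBC decryption recurrence implemented by sm4_neon_cbc_dec above,
 * for reference (illustrative C; decrypt_block is a hypothetical
 * helper): P[i] = D(C[i]) ^ C[i-1] with C[-1] = IV, so the
 * ciphertext itself supplies the chain values and the last
 * ciphertext block is written back as the new IV.
 *
 *	for (i = 0; i < nblocks; i++) {
 *		u8 buf[16];
 *
 *		decrypt_block(rk, buf, &src[16 * i]);
 *		crypto_xor(buf, iv, 16);
 *		memcpy(iv, &src[16 * i], 16);	// save before dst may
 *						// overwrite src (in-place)
 *		memcpy(&dst[16 * i], buf, 16);
 *	}
 */
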
.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE()

        ldp             x7, x8, [x3]
        rev             x7, x7
        rev             x8, x8

.Lctr_crypt_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lctr_crypt_4x

#define inc_le128(vctr)                             \
                mov             vctr.d[1], x8;      \
                mov             vctr.d[0], x7;      \
                adds            x8, x8, #1;         \
                rev64           vctr.16b, vctr.16b; \
                adc             x7, x7, xzr;

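/*
 * Illustrative C for inc_le128 (not in the original): x7 holds the
 * high and x8 the low 64 bits of the counter, byte-swapped from the
 * big-endian ctr block by the rev instructions above.  The macro
 * materializes the current counter as a big-endian vector, then
 * increments the pair with carry.
 *
 *	static void ctr_next(uint64_t *hi, uint64_t *lo, uint8_t out[16])
 *	{
 *		put_unaligned_be64(*hi, out);		// d[0] + rev64
 *		put_unaligned_be64(*lo, out + 8);	// d[1] + rev64
 *		if (++*lo == 0)				// adds ...; adc ...
 *			++*hi;
 *	}
 */
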
        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */
        inc_le128(v4)                   /* +4 */
        inc_le128(v5)                   /* +5 */
        inc_le128(v6)                   /* +6 */
        inc_le128(v7)                   /* +7 */

        transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        ld1             {RTMP0.16b-RTMP3.16b}, [x2], #64
        ld1             {RTMP4.16b-RTMP7.16b}, [x2], #64

        eor             v0.16b, v0.16b, RTMP0.16b
        eor             v1.16b, v1.16b, RTMP1.16b
        eor             v2.16b, v2.16b, RTMP2.16b
        eor             v3.16b, v3.16b, RTMP3.16b
        eor             v4.16b, v4.16b, RTMP4.16b
        eor             v5.16b, v5.16b, RTMP5.16b
        eor             v6.16b, v6.16b, RTMP6.16b
        eor             v7.16b, v7.16b, RTMP7.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lctr_crypt_end
        b               .Lctr_crypt_loop_8x

.Lctr_crypt_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lctr_crypt_tail

        sub             w4, w4, #4

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */

        ld1             {v4.16b-v7.16b}, [x2], #64

        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        eor             v3.16b, v3.16b, v7.16b

        st1             {v0.16b-v3.16b}, [x1], #64

        cbz             w4, .Lctr_crypt_end

.Lctr_crypt_tail:
        /* inc_le128 uses adds/adc and clobbers the NZCV flags,
         * so w4 is re-compared after each increment */
        ld1             {v4.16b}, [x2], #16
        inc_le128(v0)
        cmp             w4, #2
        blt             .Lctr_crypt_tail_load_done

        ld1             {v5.16b}, [x2], #16
        inc_le128(v1)
        cmp             w4, #2
        beq             .Lctr_crypt_tail_load_done

        ld1             {v6.16b}, [x2], #16
        inc_le128(v2)

.Lctr_crypt_tail_load_done:
        transpose_4x4(v0, v1, v2, v3)

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        cmp             w4, #2

        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x1], #16
        blt             .Lctr_crypt_end

        eor             v1.16b, v1.16b, v5.16b
        st1             {v1.16b}, [x1], #16
        beq             .Lctr_crypt_end

        eor             v2.16b, v2.16b, v6.16b
        st1             {v2.16b}, [x1], #16

.Lctr_crypt_end:
        /* store new CTR */
        rev             x7, x7
        rev             x8, x8
        stp             x7, x8, [x3]

        ret
SYM_FUNC_END(sm4_neon_ctr_crypt)
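
/*
 * CTR mode, for reference (illustrative C with hypothetical helper
 * names, not part of the file): each counter block is encrypted and
 * XORed into the data, so encryption and decryption are the same
 * operation, and the incremented counter is stored back for the
 * next call.
 *
 *	for (i = 0; i < nblocks; i++) {
 *		u8 ctrblk[16], keystream[16];
 *
 *		ctr_next(&hi, &lo, ctrblk);	// as in inc_le128 above
 *		encrypt_block(rk, keystream, ctrblk);
 *		crypto_xor_cpy(&dst[16 * i], &src[16 * i], keystream, 16);
 *	}
 */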
