TOMOYO Linux Cross Reference
Linux/arch/arm64/crypto/sm4-ce-core.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch   armv8-a+crypto

.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
                20, 24, 25, 26, 27, 28, 29, 30, 31
        .set .Lv\b\().4s, \b
.endr
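
/*
 * The .irp block above defines symbols .Lv<n>.4s = <n>, giving each
 * vector register name the numeric encoding needed by the raw .inst
 * words below.  Only registers that are ever passed to sm4e/sm4ekey
 * are listed; the temporaries v16-v19 and v21-v23 are not.
 */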

.macro sm4e, vd, vn
        .inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

.macro sm4ekey, vd, vn, vm
        .inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm
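
/*
 * sm4e runs four SM4 rounds on \vd using the four round keys in \vn;
 * sm4ekey derives the next four round keys from the previous four
 * (\vn) and the matching CK constants (\vm).  Emitting the raw
 * encodings via .inst keeps the file building with assemblers that do
 * not know the ARMv8 SM4 extension.
 */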

/* Register macros */

#define RTMP0   v16
#define RTMP1   v17
#define RTMP2   v18
#define RTMP3   v19

#define RIV     v20
#define RMAC    v20
#define RMASK   v21


.align 3
SYM_FUNC_START(sm4_ce_expand_key)
        /* input:
         *   x0: 128-bit key
         *   x1: rkey_enc
         *   x2: rkey_dec
         *   x3: fk array
         *   x4: ck array
         */
        ld1             {v0.16b}, [x0];
        rev32           v0.16b, v0.16b;
        ld1             {v1.16b}, [x3];
        /* load ck */
        ld1             {v24.16b-v27.16b}, [x4], #64;
        ld1             {v28.16b-v31.16b}, [x4];

        /* input ^ fk */
        eor             v0.16b, v0.16b, v1.16b;

        sm4ekey         v0.4s, v0.4s, v24.4s;
        sm4ekey         v1.4s, v0.4s, v25.4s;
        sm4ekey         v2.4s, v1.4s, v26.4s;
        sm4ekey         v3.4s, v2.4s, v27.4s;
        sm4ekey         v4.4s, v3.4s, v28.4s;
        sm4ekey         v5.4s, v4.4s, v29.4s;
        sm4ekey         v6.4s, v5.4s, v30.4s;
        sm4ekey         v7.4s, v6.4s, v31.4s;

        adr_l           x5, .Lbswap128_mask
        ld1             {v24.16b}, [x5]

        st1             {v0.16b-v3.16b}, [x1], #64;
        st1             {v4.16b-v7.16b}, [x1];

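        /*
         * The decryption round keys are the encryption round keys in
         * reverse order: .Lbswap128_mask swaps the 32-bit word order
         * within each vector, and the vectors are then stored v7..v0,
         * reversing all 32 round keys.
         */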
        tbl             v16.16b, {v7.16b}, v24.16b
        tbl             v17.16b, {v6.16b}, v24.16b
        tbl             v18.16b, {v5.16b}, v24.16b
        tbl             v19.16b, {v4.16b}, v24.16b
        tbl             v20.16b, {v3.16b}, v24.16b
        tbl             v21.16b, {v2.16b}, v24.16b
        tbl             v22.16b, {v1.16b}, v24.16b
        tbl             v23.16b, {v0.16b}, v24.16b

        st1             {v16.16b-v19.16b}, [x2], #64
        st1             {v20.16b-v23.16b}, [x2]

        ret;
SYM_FUNC_END(sm4_ce_expand_key)
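
/*
 * For reference, the C glue (sm4-ce-glue.c) is expected to declare the
 * entry points in this file along the lines of (a sketch, not verified
 * against every kernel version):
 *
 *   asmlinkage void sm4_ce_expand_key(const u8 *key,
 *                                     u32 *rkey_enc, u32 *rkey_dec,
 *                                     const u32 *fk, const u32 *ck);
 *   asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst,
 *                                const u8 *src, unsigned int nblocks);
 *
 * with fk/ck pointing at the standard SM4 FK and CK constant tables.
 */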

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         */
        SM4_PREPARE(x0)

        ld1             {v0.16b}, [x2];
        SM4_CRYPT_BLK(v0);
        st1             {v0.16b}, [x1];

        ret;
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   w3: nblocks
         */
        SM4_PREPARE(x0)

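        /*
         * Bulk loop: subtracting 8 and testing the sign bit (bit 31)
         * detects when fewer than 8 blocks remain; the tail is then
         * handled 4 blocks at a time, and finally block by block.
         */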
.Lcrypt_loop_blk:
        sub             w3, w3, #8;
        tbnz            w3, #31, .Lcrypt_tail8;

        ld1             {v0.16b-v3.16b}, [x2], #64;
        ld1             {v4.16b-v7.16b}, [x2], #64;

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

        st1             {v0.16b-v3.16b}, [x1], #64;
        st1             {v4.16b-v7.16b}, [x1], #64;

        cbz             w3, .Lcrypt_end;
        b               .Lcrypt_loop_blk;

.Lcrypt_tail8:
        add             w3, w3, #8;
        cmp             w3, #4;
        blt             .Lcrypt_tail4;

        sub             w3, w3, #4;

        ld1             {v0.16b-v3.16b}, [x2], #64;
        SM4_CRYPT_BLK4(v0, v1, v2, v3);
        st1             {v0.16b-v3.16b}, [x1], #64;

        cbz             w3, .Lcrypt_end;

.Lcrypt_tail4:
        sub             w3, w3, #1;

        ld1             {v0.16b}, [x2], #16;
        SM4_CRYPT_BLK(v0);
        st1             {v0.16b}, [x1], #16;

        cbnz            w3, .Lcrypt_tail4;

.Lcrypt_end:
        ret;
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ld1             {RIV.16b}, [x3]

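        /*
         * CBC encryption is inherently serial: each block chains the
         * previous ciphertext, so the cipher runs one block at a time
         * and only the loads and stores are batched four wide.
         */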
.Lcbc_enc_loop_4x:
        cmp             w4, #4
        blt             .Lcbc_enc_loop_1x

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        eor             v0.16b, v0.16b, RIV.16b
        SM4_CRYPT_BLK(v0)
        eor             v1.16b, v1.16b, v0.16b
        SM4_CRYPT_BLK(v1)
        eor             v2.16b, v2.16b, v1.16b
        SM4_CRYPT_BLK(v2)
        eor             v3.16b, v3.16b, v2.16b
        SM4_CRYPT_BLK(v3)

        st1             {v0.16b-v3.16b}, [x1], #64
        mov             RIV.16b, v3.16b

        cbz             w4, .Lcbc_enc_end
        b               .Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16

        eor             RIV.16b, RIV.16b, v0.16b
        SM4_CRYPT_BLK(RIV)

        st1             {RIV.16b}, [x1], #16

        cbnz            w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ld1             {RIV.16b}, [x3]

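        /*
         * Unlike encryption, CBC decryption parallelizes: eight
         * ciphertext blocks are decrypted at once and then XORed with
         * the preceding ciphertexts.  The rev32 copies feed the _BE
         * variants of the crypt macros, while v0-v7 keep the original
         * ciphertext for the XOR and for the next IV.
         */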
.Lcbc_dec_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lcbc_dec_4x

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64

        rev32           v8.16b, v0.16b
        rev32           v9.16b, v1.16b
        rev32           v10.16b, v2.16b
        rev32           v11.16b, v3.16b
        rev32           v12.16b, v4.16b
        rev32           v13.16b, v5.16b
        rev32           v14.16b, v6.16b
        rev32           v15.16b, v7.16b

        SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

        eor             v8.16b, v8.16b, RIV.16b
        eor             v9.16b, v9.16b, v0.16b
        eor             v10.16b, v10.16b, v1.16b
        eor             v11.16b, v11.16b, v2.16b
        eor             v12.16b, v12.16b, v3.16b
        eor             v13.16b, v13.16b, v4.16b
        eor             v14.16b, v14.16b, v5.16b
        eor             v15.16b, v15.16b, v6.16b

        st1             {v8.16b-v11.16b}, [x1], #64
        st1             {v12.16b-v15.16b}, [x1], #64

        mov             RIV.16b, v7.16b

        cbz             w4, .Lcbc_dec_end
        b               .Lcbc_dec_loop_8x

.Lcbc_dec_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lcbc_dec_loop_1x

        sub             w4, w4, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        rev32           v8.16b, v0.16b
        rev32           v9.16b, v1.16b
        rev32           v10.16b, v2.16b
        rev32           v11.16b, v3.16b

        SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

        eor             v8.16b, v8.16b, RIV.16b
        eor             v9.16b, v9.16b, v0.16b
        eor             v10.16b, v10.16b, v1.16b
        eor             v11.16b, v11.16b, v2.16b

        st1             {v8.16b-v11.16b}, [x1], #64

        mov             RIV.16b, v3.16b

        cbz             w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16

        rev32           v8.16b, v0.16b

        SM4_CRYPT_BLK_BE(v8)

        eor             v8.16b, v8.16b, RIV.16b
        st1             {v8.16b}, [x1], #16

        mov             RIV.16b, v0.16b

        cbnz            w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
        /* store new IV */
        st1             {RIV.16b}, [x3]

        ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nbytes
         */
        SM4_PREPARE(x0)

        sub             w5, w4, #16
        uxtw            x5, w5
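        /* x5 = length of the final (possibly partial) block */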

        ld1             {RIV.16b}, [x3]

        ld1             {v0.16b}, [x2]
        eor             RIV.16b, RIV.16b, v0.16b
        SM4_CRYPT_BLK(RIV)

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        tbl             v0.16b, {RIV.16b}, v3.16b
        /* padding Pn with zeros */
        tbl             v1.16b, {v1.16b}, v4.16b

        eor             v1.16b, v1.16b, RIV.16b
        SM4_CRYPT_BLK(v1)

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v0.16b}, [x5]
        st1             {v1.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: iv (big endian, 128 bit)
         *   w4: nbytes
         */
        SM4_PREPARE(x0)

        sub             w5, w4, #16
        uxtw            x5, w5
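        /* x5 = length of the final (possibly partial) block */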

        ld1             {RIV.16b}, [x3]

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        ld1             {v0.16b}, [x2], x5
        ld1             {v1.16b}, [x2]

        SM4_CRYPT_BLK(v0)
        /* select the first Ln bytes of Xn to create Pn */
        tbl             v2.16b, {v0.16b}, v3.16b
        eor             v2.16b, v2.16b, v1.16b

        /* overwrite the first Ln bytes with Cn to create En-1 */
        tbx             v0.16b, {v1.16b}, v4.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, RIV.16b

        /* overlapping stores */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: ctr (big endian, 128 bit)
         *   w4: nblocks
         */
        SM4_PREPARE(x0)

        ldp             x7, x8, [x3]
        rev             x7, x7
        rev             x8, x8
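        /* x7:x8 now hold the counter in host order, x7 = high 64 bits */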

.Lctr_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lctr_4x

#define inc_le128(vctr)                                 \
                mov             vctr.d[1], x8;          \
                mov             vctr.d[0], x7;          \
                adds            x8, x8, #1;             \
                rev64           vctr.16b, vctr.16b;     \
                adc             x7, x7, xzr;

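        /*
         * inc_le128 materializes the current counter as a big-endian
         * 128-bit vector (rev64 restores the byte order) and
         * post-increments x7:x8 with a carry across the full 128 bits
         * (adds/adc).
         */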
        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */
        inc_le128(v4)                   /* +4 */
        inc_le128(v5)                   /* +5 */
        inc_le128(v6)                   /* +6 */
        inc_le128(v7)                   /* +7 */

        ld1             {v8.16b-v11.16b}, [x2], #64
        ld1             {v12.16b-v15.16b}, [x2], #64

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        cbz             w4, .Lctr_end
        b               .Lctr_loop_8x

.Lctr_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lctr_loop_1x

        sub             w4, w4, #4

        /* construct CTRs */
        inc_le128(v0)                   /* +0 */
        inc_le128(v1)                   /* +1 */
        inc_le128(v2)                   /* +2 */
        inc_le128(v3)                   /* +3 */

        ld1             {v8.16b-v11.16b}, [x2], #64

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b, v8.16b
        eor             v1.16b, v1.16b, v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        st1             {v0.16b-v3.16b}, [x1], #64

        cbz             w4, .Lctr_end

.Lctr_loop_1x:
        sub             w4, w4, #1

        /* construct CTRs */
        inc_le128(v0)

        ld1             {v8.16b}, [x2], #16

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        cbnz            w4, .Lctr_loop_1x

.Lctr_end:
        /* store new CTR */
        rev             x7, x7
        rev             x8, x8
        stp             x7, x8, [x3]

        ret
SYM_FUNC_END(sm4_ce_ctr_enc)


#define tweak_next(vt, vin, RTMP)                                       \
                sshr            RTMP.2d, vin.2d, #63;                   \
                and             RTMP.16b, RTMP.16b, RMASK.16b;          \
                add             vt.2d, vin.2d, vin.2d;                  \
                ext             RTMP.16b, RTMP.16b, RTMP.16b, #8;       \
                eor             vt.16b, vt.16b, RTMP.16b;

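/*
 * tweak_next multiplies the tweak by x in GF(2^128) with the XTS
 * reduction polynomial x^128 + x^7 + x^2 + x + 1: each 64-bit lane is
 * shifted left by one (add vt, vin, vin), the bit shifted out of the
 * low lane carries into the high lane, and the bit shifted out of the
 * high lane folds back into the low lane as 0x87.  RMASK holds
 * {1, 0x87} in its two 64-bit lanes, so the sshr/and/ext/eor sequence
 * applies both the carry and the reduction at once.
 */
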
.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: tweak (big endian, 128 bit)
         *   w4: nbytes
         *   x5: round key array for IV
         */
        ld1             {v8.16b}, [x3]

        cbz             x5, .Lxts_enc_nofirst

        SM4_PREPARE(x5)

        /* Generate first tweak */
        SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
        SM4_PREPARE(x0)

        ands            w5, w4, #15
        lsr             w4, w4, #4
        sub             w6, w4, #1
        csel            w4, w4, w6, eq
        uxtw            x5, w5
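        /*
         * x5 = nbytes % 16, w4 = number of full blocks.  When a
         * partial tail exists, the last full block is held back
         * (w4 - 1) so it can take part in ciphertext stealing below.
         */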

        movi            RMASK.2s, #0x1
        movi            RTMP0.2s, #0x87
        uzp1            RMASK.4s, RMASK.4s, RTMP0.4s

        cbz             w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lxts_enc_4x

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)
        tweak_next(v12, v11, RTMP3)
        tweak_next(v13, v12, RTMP0)
        tweak_next(v14, v13, RTMP1)
        tweak_next(v15, v14, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b
        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        tweak_next(v8, v15, RTMP3)

        cbz             w4, .Lxts_enc_cts
        b               .Lxts_enc_loop_8x

.Lxts_enc_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lxts_enc_loop_1x

        sub             w4, w4, #4

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        st1             {v0.16b-v3.16b}, [x1], #64

        tweak_next(v8, v11, RTMP3)

        cbz             w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16
        eor             v0.16b, v0.16b, v8.16b

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        tweak_next(v8, v8, RTMP0)

        cbnz            w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
        cbz             x5, .Lxts_enc_end

        /* cipher text stealing */
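        /*
         * The final partial block steals ciphertext from the last full
         * block: En-1 is computed under the current tweak v8, then the
         * recombined block (Pn padded with the tail of En-1) is
         * encrypted under the next tweak v9.
         */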

        tweak_next(v9, v8, RTMP0)
        ld1             {v0.16b}, [x2]
        eor             v0.16b, v0.16b, v8.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v8.16b

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        tbl             v2.16b, {v0.16b}, v3.16b
        /* padding Pn with En-1 at the end */
        tbx             v0.16b, {v1.16b}, v4.16b

        eor             v0.16b, v0.16b, v9.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v9.16b


        /* overlapping stores */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        b               .Lxts_enc_ret

.Lxts_enc_end:
        /* store new tweak */
        st1             {v8.16b}, [x3]

.Lxts_enc_ret:
        ret
SYM_FUNC_END(sm4_ce_xts_enc)

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
        /* input:
         *   x0: round key array, CTX
         *   x1: dst
         *   x2: src
         *   x3: tweak (big endian, 128 bit)
         *   w4: nbytes
         *   x5: round key array for IV
         */
        ld1             {v8.16b}, [x3]

        cbz             x5, .Lxts_dec_nofirst

        SM4_PREPARE(x5)

        /* Generate first tweak */
        SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
        SM4_PREPARE(x0)

        ands            w5, w4, #15
        lsr             w4, w4, #4
        sub             w6, w4, #1
        csel            w4, w4, w6, eq
        uxtw            x5, w5

        movi            RMASK.2s, #0x1
        movi            RTMP0.2s, #0x87
        uzp1            RMASK.4s, RMASK.4s, RTMP0.4s

        cbz             w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
        sub             w4, w4, #8
        tbnz            w4, #31, .Lxts_dec_4x

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)
        tweak_next(v12, v11, RTMP3)
        tweak_next(v13, v12, RTMP0)
        tweak_next(v14, v13, RTMP1)
        tweak_next(v15, v14, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        ld1             {v4.16b-v7.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b

        SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        eor             v4.16b, v4.16b, v12.16b
        eor             v5.16b, v5.16b, v13.16b
        eor             v6.16b, v6.16b, v14.16b
        eor             v7.16b, v7.16b, v15.16b
        st1             {v0.16b-v3.16b}, [x1], #64
        st1             {v4.16b-v7.16b}, [x1], #64

        tweak_next(v8, v15, RTMP3)

        cbz             w4, .Lxts_dec_cts
        b               .Lxts_dec_loop_8x

.Lxts_dec_4x:
        add             w4, w4, #8
        cmp             w4, #4
        blt             .Lxts_dec_loop_1x

        sub             w4, w4, #4

        tweak_next( v9,  v8, RTMP0)
        tweak_next(v10,  v9, RTMP1)
        tweak_next(v11, v10, RTMP2)

        ld1             {v0.16b-v3.16b}, [x2], #64
        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b

        SM4_CRYPT_BLK4(v0, v1, v2, v3)

        eor             v0.16b, v0.16b,  v8.16b
        eor             v1.16b, v1.16b,  v9.16b
        eor             v2.16b, v2.16b, v10.16b
        eor             v3.16b, v3.16b, v11.16b
        st1             {v0.16b-v3.16b}, [x1], #64

        tweak_next(v8, v11, RTMP3)

        cbz             w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
        sub             w4, w4, #1

        ld1             {v0.16b}, [x2], #16
        eor             v0.16b, v0.16b, v8.16b

        SM4_CRYPT_BLK(v0)

        eor             v0.16b, v0.16b, v8.16b
        st1             {v0.16b}, [x1], #16

        tweak_next(v8, v8, RTMP0)

        cbnz            w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
        cbz             x5, .Lxts_dec_end

        /* cipher text stealing */
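        /*
         * Decryption swaps the order of the last two tweaks: the last
         * full ciphertext block is processed under the next tweak v9,
         * and the recombined stolen block under the current tweak v8,
         * as the XTS-CTS construction requires.
         */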

        tweak_next(v9, v8, RTMP0)
        ld1             {v0.16b}, [x2]
        eor             v0.16b, v0.16b, v9.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v9.16b

        /* load permute table */
        adr_l           x6, .Lcts_permute_table
        add             x7, x6, #32
        add             x6, x6, x5
        sub             x7, x7, x5
        ld1             {v3.16b}, [x6]
        ld1             {v4.16b}, [x7]

        /* overlapping loads */
        add             x2, x2, x5
        ld1             {v1.16b}, [x2]

        /* create Cn from En-1 */
        tbl             v2.16b, {v0.16b}, v3.16b
        /* padding Pn with En-1 at the end */
        tbx             v0.16b, {v1.16b}, v4.16b

        eor             v0.16b, v0.16b, v8.16b
        SM4_CRYPT_BLK(v0)
        eor             v0.16b, v0.16b, v8.16b


        /* overlapping stores */
        add             x5, x1, x5
        st1             {v2.16b}, [x5]
        st1             {v0.16b}, [x1]

        b               .Lxts_dec_ret

.Lxts_dec_end:
        /* store new tweak */
        st1             {v8.16b}, [x3]

.Lxts_dec_ret:
        ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
SYM_FUNC_START(sm4_ce_mac_update)
        /* input:
         *   x0: round key array, CTX
         *   x1: digest
         *   x2: src
         *   w3: nblocks
         *   w4: enc_before
         *   w5: enc_after
         */
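        /*
         * CBC-MAC style absorb: mac = E(mac ^ p[i]) for each block.
         * enc_before forces an initial encryption of the incoming
         * digest; when enc_after is zero the final block is only XORed
         * in, leaving its encryption to a later call (as a CMAC-style
         * finalization requires).
         */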
        SM4_PREPARE(x0)

        ld1             {RMAC.16b}, [x1]

        cbz             w4, .Lmac_update

        SM4_CRYPT_BLK(RMAC)

.Lmac_update:
        cbz             w3, .Lmac_ret

        sub             w6, w3, #1
        cmp             w5, wzr
        csel            w3, w3, w6, ne

        cbz             w3, .Lmac_end

.Lmac_loop_4x:
        cmp             w3, #4
        blt             .Lmac_loop_1x

        sub             w3, w3, #4

        ld1             {v0.16b-v3.16b}, [x2], #64

        eor             RMAC.16b, RMAC.16b, v0.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v1.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v2.16b
        SM4_CRYPT_BLK(RMAC)
        eor             RMAC.16b, RMAC.16b, v3.16b
        SM4_CRYPT_BLK(RMAC)

        cbz             w3, .Lmac_end
        b               .Lmac_loop_4x

.Lmac_loop_1x:
        sub             w3, w3, #1

        ld1             {v0.16b}, [x2], #16

        eor             RMAC.16b, RMAC.16b, v0.16b
        SM4_CRYPT_BLK(RMAC)

        cbnz            w3, .Lmac_loop_1x


.Lmac_end:
        cbnz            w5, .Lmac_ret

        ld1             {v0.16b}, [x2], #16
        eor             RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
        st1             {RMAC.16b}, [x1]
        ret
SYM_FUNC_END(sm4_ce_mac_update)


        .section        ".rodata", "a"
        .align 4
.Lbswap128_mask:
        .byte           0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
        .byte           0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

.Lcts_permute_table:
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
        .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte           0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
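
/*
 * With tbl/tbx an index byte >= 16 (0xff here) reads as zero (tbl) or
 * leaves the destination byte unchanged (tbx), so indexing into
 * .Lcts_permute_table at offsets derived from the tail length yields
 * the sliding shift/select masks used by the CTS paths above.
 */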
