/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>
#include <asm/frame.h>

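/*
 * Constants used below:
 *
 * ROT8 and ROT16 are pshufb masks that rotate each 32-bit lane left by 8
 * or 16 bits (pshufb sets destination byte i to source byte mask[i]).
 * CTRINC holds the dwords { 0, 1, 2, 3 } that chacha_4block_xor_ssse3
 * adds to the block counter so the four parallel blocks use consecutive
 * counter values.
 */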
.section        .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
.section        .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
.section        .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC: .octa 0x00000003000000020000000100000000

.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round.  8- and 16-bit word
 * rotations are done with the slightly better performing SSSE3 byte shuffle;
 * 7- and 12-bit rotations use the traditional shift+OR.
 *
 * The round count is given in %r8d and must be even.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
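/*
 * For reference, each iteration of .Ldoubleround below computes, per
 * 32-bit lane (pseudocode only):
 *
 *	x0 += x1; x3 = rol32(x3 ^ x0, 16);
 *	x2 += x3; x1 = rol32(x1 ^ x2, 12);
 *	x0 += x1; x3 = rol32(x3 ^ x0,  8);
 *	x2 += x3; x1 = rol32(x1 ^ x2,  7);
 *
 * followed by a lane rotation of x1..x3 (pshufd) so the same four steps
 * then operate on the diagonals, and a rotation back afterwards.
 */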
SYM_FUNC_START_LOCAL(chacha_permute)

        movdqa          ROT8(%rip),%xmm4
        movdqa          ROT16(%rip),%xmm5

.Ldoubleround:
        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm6
        pslld           $12,%xmm6
        psrld           $20,%xmm1
        por             %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm7
        pslld           $7,%xmm7
        psrld           $25,%xmm1
        por             %xmm7,%xmm1

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        pshufd          $0x39,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd          $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        pshufd          $0x93,%xmm3,%xmm3
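        # The pshufd immediates encode the source lane of each destination
        # lane in two bits each: 0x39, 0x4e and 0x93 rotate the rows by one,
        # two and three lanes so that the second half of the double round
        # works on the diagonals of the state matrix.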

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm6
        pslld           $12,%xmm6
        psrld           $20,%xmm1
        por             %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm7
        pslld           $7,%xmm7
        psrld           $25,%xmm1
        por             %xmm7,%xmm1

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        pshufd          $0x93,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd          $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        pshufd          $0x39,%xmm3,%xmm3

        sub             $2,%r8d
        jnz             .Ldoubleround

        RET
SYM_FUNC_END(chacha_permute)

SYM_FUNC_START(chacha_block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: up to 1 data block output, o
        # %rdx: up to 1 data block input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds
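        #
        # The length in %rcx may be less than a full 64-byte block; full
        # 16-byte chunks are XORed directly, and a trailing partial chunk
        # is handled at .Lxorpart below.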
        FRAME_BEGIN

        # x0..3 = s0..3
        movdqu          0x00(%rdi),%xmm0
        movdqu          0x10(%rdi),%xmm1
        movdqu          0x20(%rdi),%xmm2
        movdqu          0x30(%rdi),%xmm3
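        # save s0..3 in %xmm8-%xmm11 for the feed-forward addition after
        # the permutation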
        movdqa          %xmm0,%xmm8
        movdqa          %xmm1,%xmm9
        movdqa          %xmm2,%xmm10
        movdqa          %xmm3,%xmm11

        mov             %rcx,%rax
        call            chacha_permute

        # o0 = i0 ^ (x0 + s0)
        paddd           %xmm8,%xmm0
        cmp             $0x10,%rax
        jl              .Lxorpart
        movdqu          0x00(%rdx),%xmm4
        pxor            %xmm4,%xmm0
        movdqu          %xmm0,0x00(%rsi)
        # o1 = i1 ^ (x1 + s1)
        paddd           %xmm9,%xmm1
        movdqa          %xmm1,%xmm0
        cmp             $0x20,%rax
        jl              .Lxorpart
        movdqu          0x10(%rdx),%xmm0
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x10(%rsi)
        # o2 = i2 ^ (x2 + s2)
        paddd           %xmm10,%xmm2
        movdqa          %xmm2,%xmm0
        cmp             $0x30,%rax
        jl              .Lxorpart
        movdqu          0x20(%rdx),%xmm0
        pxor            %xmm2,%xmm0
        movdqu          %xmm0,0x20(%rsi)
        # o3 = i3 ^ (x3 + s3)
        paddd           %xmm11,%xmm3
        movdqa          %xmm3,%xmm0
        cmp             $0x40,%rax
        jl              .Lxorpart
        movdqu          0x30(%rdx),%xmm0
        pxor            %xmm3,%xmm0
        movdqu          %xmm0,0x30(%rsi)

.Ldone:
        FRAME_END
        RET

.Lxorpart:
        # xor remaining bytes from partial register into output
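        #
        # %r9 holds the number of trailing bytes (len & 15) and %rax is
        # rounded down to the last full 16-byte boundary.  The leftover
        # input bytes are bounced through an aligned scratch slot on the
        # stack, XORed there with the keystream chunk still in %xmm0, and
        # then copied to the output.  %r10 is used to restore the stack
        # pointer afterwards; %r11 preserves the output pointer across the
        # rep movsb copies.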
        mov             %rax,%r9
        and             $0x0f,%r9
        jz              .Ldone
        and             $~0x0f,%rax

        mov             %rsi,%r11

        lea             8(%rsp),%r10
        sub             $0x10,%rsp
        and             $~31,%rsp

        lea             (%rdx,%rax),%rsi
        mov             %rsp,%rdi
        mov             %r9,%rcx
        rep movsb

        pxor            0x00(%rsp),%xmm0
        movdqa          %xmm0,0x00(%rsp)

        mov             %rsp,%rsi
        lea             (%r11,%rax),%rdi
        mov             %r9,%rcx
        rep movsb

        lea             -8(%r10),%rsp
        jmp             .Ldone

SYM_FUNC_END(chacha_block_xor_ssse3)

SYM_FUNC_START(hchacha_block_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: output (8 32-bit words)
        # %edx: nrounds
        FRAME_BEGIN

        movdqu          0x00(%rdi),%xmm0
        movdqu          0x10(%rdi),%xmm1
        movdqu          0x20(%rdi),%xmm2
        movdqu          0x30(%rdi),%xmm3

        mov             %edx,%r8d
        call            chacha_permute

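        # HChaCha output: words 0..3 and 12..15 of the permuted state,
        # without the feed-forward addition of the input state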
        movdqu          %xmm0,0x00(%rsi)
        movdqu          %xmm3,0x10(%rsi)

        FRAME_END
        RET
SYM_FUNC_END(hchacha_block_ssse3)

SYM_FUNC_START(chacha_4block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: up to 4 data blocks output, o
        # %rdx: up to 4 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts four consecutive ChaCha blocks by loading
        # the state matrix into SSE registers four times. As we need some
        # scratch registers, we save the first four registers on the stack.
        # The algorithm performs each operation on the corresponding word of
        # each state matrix, and hence requires no word shuffling. For the
        # final XOR step we transpose the matrix by interleaving 32- and
        # then 64-bit words, which allows us to do the XOR in SSE registers.
        # 8- and 16-bit word rotations are done with the slightly better
        # performing SSSE3 byte shuffle; 7- and 12-bit rotations use the
        # traditional shift+OR.
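        #
        # Register use below: rows x0..x3 of the four interleaved states
        # live on the stack at 0x00..0x30(%rsp), x4..x15 stay in
        # %xmm4..%xmm15, %xmm0 is scratch, and %xmm1/%xmm2/%xmm3 hold the
        # CTRINC, ROT8 and ROT16 constants.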

        lea             8(%rsp),%r10
        sub             $0x80,%rsp
        and             $~63,%rsp
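        # %r10 records the incoming stack pointer so it can be restored at
        # .Ldone4; the 0x80-byte scratch area is aligned down to 64 bytes,
        # so the movdqa accesses to 0x00..0x30(%rsp) below are safe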
        mov             %rcx,%rax

        # x0..15[0-3] = s0..3[0..3]
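        # (each movq loads two consecutive state words; pshufd $0x00 and
        # $0x55 broadcast the low and high dword across all four lanes)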
        movq            0x00(%rdi),%xmm1
        pshufd          $0x00,%xmm1,%xmm0
        pshufd          $0x55,%xmm1,%xmm1
        movq            0x08(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        movq            0x10(%rdi),%xmm5
        pshufd          $0x00,%xmm5,%xmm4
        pshufd          $0x55,%xmm5,%xmm5
        movq            0x18(%rdi),%xmm7
        pshufd          $0x00,%xmm7,%xmm6
        pshufd          $0x55,%xmm7,%xmm7
        movq            0x20(%rdi),%xmm9
        pshufd          $0x00,%xmm9,%xmm8
        pshufd          $0x55,%xmm9,%xmm9
        movq            0x28(%rdi),%xmm11
        pshufd          $0x00,%xmm11,%xmm10
        pshufd          $0x55,%xmm11,%xmm11
        movq            0x30(%rdi),%xmm13
        pshufd          $0x00,%xmm13,%xmm12
        pshufd          $0x55,%xmm13,%xmm13
        movq            0x38(%rdi),%xmm15
        pshufd          $0x00,%xmm15,%xmm14
        pshufd          $0x55,%xmm15,%xmm15
        # x0..3 on stack
        movdqa          %xmm0,0x00(%rsp)
        movdqa          %xmm1,0x10(%rsp)
        movdqa          %xmm2,0x20(%rsp)
        movdqa          %xmm3,0x30(%rsp)

        movdqa          CTRINC(%rip),%xmm1
        movdqa          ROT8(%rip),%xmm2
        movdqa          ROT16(%rip),%xmm3

        # x12 += counter values 0-3
        paddd           %xmm1,%xmm12

.Ldoubleround4:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm3,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm3,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm3,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm3,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        paddd           %xmm12,%xmm8
        pxor            %xmm8,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm4
        por             %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        paddd           %xmm13,%xmm9
        pxor            %xmm9,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm5
        por             %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        paddd           %xmm14,%xmm10
        pxor            %xmm10,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm6
        por             %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        paddd           %xmm15,%xmm11
        pxor            %xmm11,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm7
        por             %xmm0,%xmm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm2,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm2,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm2,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm2,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        paddd           %xmm12,%xmm8
        pxor            %xmm8,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm4
        por             %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        paddd           %xmm13,%xmm9
        pxor            %xmm9,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm5
        por             %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        paddd           %xmm14,%xmm10
        pxor            %xmm10,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm6
        por             %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        paddd           %xmm15,%xmm11
        pxor            %xmm11,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm7
        por             %xmm0,%xmm7

        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm3,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm3,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm3,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm3,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        paddd           %xmm15,%xmm10
        pxor            %xmm10,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm5
        por             %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        paddd           %xmm12,%xmm11
        pxor            %xmm11,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm6
        por             %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        paddd           %xmm13,%xmm8
        pxor            %xmm8,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm7
        por             %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        paddd           %xmm14,%xmm9
        pxor            %xmm9,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm4
        por             %xmm0,%xmm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm2,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm2,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm2,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm2,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        paddd           %xmm15,%xmm10
        pxor            %xmm10,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm5
        por             %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        paddd           %xmm12,%xmm11
        pxor            %xmm11,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm6
        por             %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        paddd           %xmm13,%xmm8
        pxor            %xmm8,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm7
        por             %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        paddd           %xmm14,%xmm9
        pxor            %xmm9,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm4
        por             %xmm0,%xmm4

        sub             $2,%r8d
        jnz             .Ldoubleround4

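        # Feed-forward: add the original input state back into the working
        # state, re-broadcasting each input word just as at function entry
        # (the x0..x3 rows are updated in place on the stack).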
        # x0[0-3] += s0[0]
        # x1[0-3] += s0[1]
        movq            0x00(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           0x00(%rsp),%xmm2
        movdqa          %xmm2,0x00(%rsp)
        paddd           0x10(%rsp),%xmm3
        movdqa          %xmm3,0x10(%rsp)
        # x2[0-3] += s0[2]
        # x3[0-3] += s0[3]
        movq            0x08(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           0x20(%rsp),%xmm2
        movdqa          %xmm2,0x20(%rsp)
        paddd           0x30(%rsp),%xmm3
        movdqa          %xmm3,0x30(%rsp)

        # x4[0-3] += s1[0]
        # x5[0-3] += s1[1]
        movq            0x10(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm4
        paddd           %xmm3,%xmm5
        # x6[0-3] += s1[2]
        # x7[0-3] += s1[3]
        movq            0x18(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm6
        paddd           %xmm3,%xmm7

        # x8[0-3] += s2[0]
        # x9[0-3] += s2[1]
        movq            0x20(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm8
        paddd           %xmm3,%xmm9
        # x10[0-3] += s2[2]
        # x11[0-3] += s2[3]
        movq            0x28(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm10
        paddd           %xmm3,%xmm11

        # x12[0-3] += s3[0]
        # x13[0-3] += s3[1]
        movq            0x30(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm12
        paddd           %xmm3,%xmm13
        # x14[0-3] += s3[2]
        # x15[0-3] += s3[3]
        movq            0x38(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm14
        paddd           %xmm3,%xmm15

        # x12 += counter values 0-3
        paddd           %xmm1,%xmm12

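        # Transpose the four states: after the 32-bit and then 64-bit
        # interleaves below, each register or stack row holds 16 consecutive
        # bytes of one output block instead of one word from each block.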
        # interleave 32-bit words in state n, n+1
        movdqa          0x00(%rsp),%xmm0
        movdqa          0x10(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpckldq       %xmm1,%xmm2
        punpckhdq       %xmm1,%xmm0
        movdqa          %xmm2,0x00(%rsp)
        movdqa          %xmm0,0x10(%rsp)
        movdqa          0x20(%rsp),%xmm0
        movdqa          0x30(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpckldq       %xmm1,%xmm2
        punpckhdq       %xmm1,%xmm0
        movdqa          %xmm2,0x20(%rsp)
        movdqa          %xmm0,0x30(%rsp)
        movdqa          %xmm4,%xmm0
        punpckldq       %xmm5,%xmm4
        punpckhdq       %xmm5,%xmm0
        movdqa          %xmm0,%xmm5
        movdqa          %xmm6,%xmm0
        punpckldq       %xmm7,%xmm6
        punpckhdq       %xmm7,%xmm0
        movdqa          %xmm0,%xmm7
        movdqa          %xmm8,%xmm0
        punpckldq       %xmm9,%xmm8
        punpckhdq       %xmm9,%xmm0
        movdqa          %xmm0,%xmm9
        movdqa          %xmm10,%xmm0
        punpckldq       %xmm11,%xmm10
        punpckhdq       %xmm11,%xmm0
        movdqa          %xmm0,%xmm11
        movdqa          %xmm12,%xmm0
        punpckldq       %xmm13,%xmm12
        punpckhdq       %xmm13,%xmm0
        movdqa          %xmm0,%xmm13
        movdqa          %xmm14,%xmm0
        punpckldq       %xmm15,%xmm14
        punpckhdq       %xmm15,%xmm0
        movdqa          %xmm0,%xmm15

        # interleave 64-bit words in state n, n+2
        movdqa          0x00(%rsp),%xmm0
        movdqa          0x20(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpcklqdq      %xmm1,%xmm2
        punpckhqdq      %xmm1,%xmm0
        movdqa          %xmm2,0x00(%rsp)
        movdqa          %xmm0,0x20(%rsp)
        movdqa          0x10(%rsp),%xmm0
        movdqa          0x30(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpcklqdq      %xmm1,%xmm2
        punpckhqdq      %xmm1,%xmm0
        movdqa          %xmm2,0x10(%rsp)
        movdqa          %xmm0,0x30(%rsp)
        movdqa          %xmm4,%xmm0
        punpcklqdq      %xmm6,%xmm4
        punpckhqdq      %xmm6,%xmm0
        movdqa          %xmm0,%xmm6
        movdqa          %xmm5,%xmm0
        punpcklqdq      %xmm7,%xmm5
        punpckhqdq      %xmm7,%xmm0
        movdqa          %xmm0,%xmm7
        movdqa          %xmm8,%xmm0
        punpcklqdq      %xmm10,%xmm8
        punpckhqdq      %xmm10,%xmm0
        movdqa          %xmm0,%xmm10
        movdqa          %xmm9,%xmm0
        punpcklqdq      %xmm11,%xmm9
        punpckhqdq      %xmm11,%xmm0
        movdqa          %xmm0,%xmm11
        movdqa          %xmm12,%xmm0
        punpcklqdq      %xmm14,%xmm12
        punpckhqdq      %xmm14,%xmm0
        movdqa          %xmm0,%xmm14
        movdqa          %xmm13,%xmm0
        punpcklqdq      %xmm15,%xmm13
        punpckhqdq      %xmm15,%xmm0
        movdqa          %xmm0,%xmm15

        # xor with corresponding input, write to output
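        # The cmp/jl pairs allow stopping after any 16-byte chunk, so
        # lengths shorter than the full 256 bytes are handled; a trailing
        # partial chunk is finished at .Lxorpart4.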
        movdqa          0x00(%rsp),%xmm0
        cmp             $0x10,%rax
        jl              .Lxorpart4
        movdqu          0x00(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x00(%rsi)

        movdqu          %xmm4,%xmm0
        cmp             $0x20,%rax
        jl              .Lxorpart4
        movdqu          0x10(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x10(%rsi)

        movdqu          %xmm8,%xmm0
        cmp             $0x30,%rax
        jl              .Lxorpart4
        movdqu          0x20(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x20(%rsi)

        movdqu          %xmm12,%xmm0
        cmp             $0x40,%rax
        jl              .Lxorpart4
        movdqu          0x30(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x30(%rsi)

        movdqa          0x20(%rsp),%xmm0
        cmp             $0x50,%rax
        jl              .Lxorpart4
        movdqu          0x40(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x40(%rsi)

        movdqu          %xmm6,%xmm0
        cmp             $0x60,%rax
        jl              .Lxorpart4
        movdqu          0x50(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x50(%rsi)

        movdqu          %xmm10,%xmm0
        cmp             $0x70,%rax
        jl              .Lxorpart4
        movdqu          0x60(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x60(%rsi)

        movdqu          %xmm14,%xmm0
        cmp             $0x80,%rax
        jl              .Lxorpart4
        movdqu          0x70(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x70(%rsi)

        movdqa          0x10(%rsp),%xmm0
        cmp             $0x90,%rax
        jl              .Lxorpart4
        movdqu          0x80(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x80(%rsi)

        movdqu          %xmm5,%xmm0
        cmp             $0xa0,%rax
        jl              .Lxorpart4
        movdqu          0x90(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x90(%rsi)

        movdqu          %xmm9,%xmm0
        cmp             $0xb0,%rax
        jl              .Lxorpart4
        movdqu          0xa0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xa0(%rsi)

        movdqu          %xmm13,%xmm0
        cmp             $0xc0,%rax
        jl              .Lxorpart4
        movdqu          0xb0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xb0(%rsi)

        movdqa          0x30(%rsp),%xmm0
        cmp             $0xd0,%rax
        jl              .Lxorpart4
        movdqu          0xc0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xc0(%rsi)

        movdqu          %xmm7,%xmm0
        cmp             $0xe0,%rax
        jl              .Lxorpart4
        movdqu          0xd0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xd0(%rsi)

        movdqu          %xmm11,%xmm0
        cmp             $0xf0,%rax
        jl              .Lxorpart4
        movdqu          0xe0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xe0(%rsi)

        movdqu          %xmm15,%xmm0
        cmp             $0x100,%rax
        jl              .Lxorpart4
        movdqu          0xf0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xf0(%rsi)

.Ldone4:
        lea             -8(%r10),%rsp
        RET

.Lxorpart4:
        # xor remaining bytes from partial register into output
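        #
        # Same bounce-buffer scheme as .Lxorpart in chacha_block_xor_ssse3,
        # except that %rsp already points at the 64-byte aligned scratch
        # area set up at function entry, so no extra alignment is needed.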
        mov             %rax,%r9
        and             $0x0f,%r9
        jz              .Ldone4
        and             $~0x0f,%rax

        mov             %rsi,%r11

        lea             (%rdx,%rax),%rsi
        mov             %rsp,%rdi
        mov             %r9,%rcx
        rep movsb

        pxor            0x00(%rsp),%xmm0
        movdqa          %xmm0,0x00(%rsp)

        mov             %rsp,%rsi
        lea             (%r11,%rax),%rdi
        mov             %r9,%rcx
        rep movsb

        jmp             .Ldone4

SYM_FUNC_END(chacha_4block_xor_ssse3)
