~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/x86/include/asm/xor.h

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /* SPDX-License-Identifier: GPL-2.0-or-later */
  2 #ifndef _ASM_X86_XOR_H
  3 #define _ASM_X86_XOR_H
  4 
  5 /*
  6  * Optimized RAID-5 checksumming functions for SSE.
  7  */
  8 
  9 /*
 10  * Cache avoiding checksumming functions utilizing KNI instructions
 11  * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 12  */
 13 
 14 /*
 15  * Based on
 16  * High-speed RAID5 checksumming functions utilizing SSE instructions.
 17  * Copyright (C) 1998 Ingo Molnar.
 18  */
 19 
 20 /*
 21  * x86-64 changes / gcc fixes from Andi Kleen.
 22  * Copyright 2002 Andi Kleen, SuSE Labs.
 23  *
 24  * This hasn't been optimized for the hammer yet, but there are likely
 25  * no advantages to be gotten from x86-64 here anyways.
 26  */
 27 
 28 #include <asm/fpu/api.h>
 29 
/*
 * Constraint for the [inc] asm operand (the per-iteration pointer
 * increment, always 256UL below).
 */
#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

/* Byte offset of the x'th 16-byte (xmm-sized) chunk within a 256-byte line. */
#define OFFS(x)         "16*("#x")"
/* Prefetch offset: chunk x, one 256-byte line ahead of the current one. */
#define PF_OFFS(x)      "256+16*("#x")"
/* Non-temporal prefetch one line ahead in p1 (PF0) .. p5 (PF4). */
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
/* Load 16-byte chunk x of p1 into %xmm<y>. */
#define LD(x, y)        "       movaps "OFFS(x)"(%[p1]), %%xmm"#y"      ;\n"
/* Store %xmm<y> back to chunk x of p1 (p1 is the destination buffer). */
#define ST(x, y)        "       movaps %%xmm"#y", "OFFS(x)"(%[p1])      ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
/* XOR chunk x of p2 (XO1) .. p5 (XO4) into %xmm<y>. */
#define XO1(x, y)       "       xorps "OFFS(x)"(%[p2]), %%xmm"#y"       ;\n"
#define XO2(x, y)       "       xorps "OFFS(x)"(%[p3]), %%xmm"#y"       ;\n"
#define XO3(x, y)       "       xorps "OFFS(x)"(%[p4]), %%xmm"#y"       ;\n"
#define XO4(x, y)       "       xorps "OFFS(x)"(%[p5]), %%xmm"#y"       ;\n"
/* Placeholder for BLK64() prefetch slots that need no instruction. */
#define NOP(x)

/*
 * One prefetch followed by four 16-byte ops on chunks i..i+3 through
 * %xmm0..%xmm3, i.e. one 64-byte step of a 256-byte line.  Used by the
 * *_pf64 variants, which issue one prefetch per 64 bytes.
 */
#define BLK64(pf, op, i)                                \
                pf(i)                                   \
                op(i, 0)                                \
                        op(i + 1, 1)                    \
                                op(i + 2, 2)            \
                                        op(i + 3, 3)
 58 
/*
 * xor_sse_2 - p1[i] ^= p2[i] over @bytes bytes, using SSE xorps.
 *
 * Processes bytes >> 8 "lines" of 256 bytes each (16 chunks of 16 bytes
 * through %xmm0-%xmm3); any tail smaller than 256 bytes is not touched,
 * so callers are assumed to pass a multiple of 256.  NOTE(review):
 * movaps/xorps require 16-byte-aligned buffers -- presumably guaranteed
 * by the RAID callers, confirm there.  prefetchnta pulls data one line
 * (256 bytes) ahead without polluting the cache hierarchy.
 */
static void
xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
          const unsigned long * __restrict p2)
{
        unsigned long lines = bytes >> 8;       /* number of 256-byte lines */

        /* XMM registers may only be used between kernel_fpu_begin/end. */
        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
/* One 64-byte step: load 4 chunks of p1, XOR in p2, store back to p1. */
#define BLOCK(i)                                        \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                PF1(i)                                  \
                                PF1(i + 2)              \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                /* Warm up the prefetcher for the first line of p1. */
                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
110 
/*
 * xor_sse_2_pf64 - same operation as xor_sse_2 (p1 ^= p2 over @bytes),
 * but with a different prefetch schedule: one prefetchnta per 64-byte
 * sub-block (via BLK64) instead of xor_sse_2's hand-interleaved pattern.
 * Same alignment / multiple-of-256 assumptions as xor_sse_2.
 */
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
               const unsigned long * __restrict p2)
{
        unsigned long lines = bytes >> 8;       /* number of 256-byte lines */

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
/* Per 64-byte step: prefetch+load p1, prefetch+XOR p2, store to p1. */
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
145 
/*
 * xor_sse_3 - p1[i] ^= p2[i] ^ p3[i] over @bytes bytes.
 *
 * Three-source variant of xor_sse_2: each 256-byte line of p1 is loaded,
 * XORed with the matching lines of p2 and p3, and stored back to p1.
 * Same assumptions: @bytes a multiple of 256, 16-byte-aligned buffers
 * (movaps/xorps), must run under kernel_fpu_begin/end.
 */
static void
xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
          const unsigned long * __restrict p2,
          const unsigned long * __restrict p3)
{
        unsigned long lines = bytes >> 8;       /* number of 256-byte lines */

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
/* One 64-byte step: load p1 chunks, XOR in p2 then p3, store back. */
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                /* Warm up the prefetcher for the first line of p1. */
                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
205 
/*
 * xor_sse_3_pf64 - same operation as xor_sse_3 (p1 ^= p2 ^ p3), using
 * the BLK64 layout: one prefetchnta per 64-byte sub-block per source.
 */
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
               const unsigned long * __restrict p2,
               const unsigned long * __restrict p3)
{
        unsigned long lines = bytes >> 8;       /* number of 256-byte lines */

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
/* Per 64-byte step: prefetch+load p1, prefetch+XOR p2 and p3, store. */
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
243 
/*
 * xor_sse_4 - p1[i] ^= p2[i] ^ p3[i] ^ p4[i] over @bytes bytes.
 *
 * Four-source variant; same structure and assumptions as xor_sse_2/3:
 * @bytes a multiple of 256, 16-byte-aligned buffers, runs inside
 * kernel_fpu_begin/end.
 */
static void
xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
          const unsigned long * __restrict p2,
          const unsigned long * __restrict p3,
          const unsigned long * __restrict p4)
{
        unsigned long lines = bytes >> 8;       /* number of 256-byte lines */

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
/* One 64-byte step: load p1, XOR in p2..p4 in turn, store back to p1. */
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                /* Warm up the prefetcher for the first line of p1. */
                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
311 
/*
 * xor_sse_4_pf64 - same operation as xor_sse_4 (p1 ^= p2 ^ p3 ^ p4),
 * using the BLK64 layout: one prefetchnta per 64-byte sub-block per
 * source.
 */
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
               const unsigned long * __restrict p2,
               const unsigned long * __restrict p3,
               const unsigned long * __restrict p4)
{
        unsigned long lines = bytes >> 8;       /* number of 256-byte lines */

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
/* Per 64-byte step: prefetch+load p1, prefetch+XOR p2..p4, store. */
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(PF3, XO3, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
352 
/*
 * xor_sse_5 - p1[i] ^= p2[i] ^ p3[i] ^ p4[i] ^ p5[i] over @bytes bytes.
 *
 * Five-source variant; same structure and assumptions as the smaller
 * ones: @bytes a multiple of 256, 16-byte-aligned buffers, runs inside
 * kernel_fpu_begin/end.
 */
static void
xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
          const unsigned long * __restrict p2,
          const unsigned long * __restrict p3,
          const unsigned long * __restrict p4,
          const unsigned long * __restrict p5)
{
        unsigned long lines = bytes >> 8;       /* number of 256-byte lines */

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
/* One 64-byte step: load p1, XOR in p2..p5 in turn, store back to p1. */
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                PF4(i)                                  \
                                PF4(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                XO4(i, 0)                               \
                        XO4(i + 1, 1)                   \
                                XO4(i + 2, 2)           \
                                        XO4(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                /* Warm up the prefetcher for the first line of p1. */
                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       add %[inc], %[p5]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
428 
/*
 * xor_sse_5_pf64 - same operation as xor_sse_5 (p1 ^= p2 ^ p3 ^ p4 ^ p5),
 * using the BLK64 layout: one prefetchnta per 64-byte sub-block per
 * source.
 */
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
               const unsigned long * __restrict p2,
               const unsigned long * __restrict p3,
               const unsigned long * __restrict p4,
               const unsigned long * __restrict p5)
{
        unsigned long lines = bytes >> 8;       /* number of 256-byte lines */

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
/* Per 64-byte step: prefetch+load p1, prefetch+XOR p2..p5, store. */
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(PF3, XO3, i)      \
                BLK64(PF4, XO4, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       add %[inc], %[p5]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
472 
/*
 * Template bundling the *_pf64 variants defined above under the name
 * "prefetch64-sse".  NOTE(review): struct xor_block_template and the
 * code that benchmarks/selects between templates live outside this
 * file -- presumably the generic xor_blocks machinery; confirm the
 * .do_2/.do_5 member semantics there.
 */
static struct xor_block_template xor_block_sse_pf64 = {
        .name = "prefetch64-sse",
        .do_2 = xor_sse_2_pf64,
        .do_3 = xor_sse_3_pf64,
        .do_4 = xor_sse_4_pf64,
        .do_5 = xor_sse_5_pf64,
};
480 
/* Scrub the asm helper macros so they don't leak past this header. */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

/* Pull in the bitness-specific xor templates. */
#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

/*
 * NOTE(review): AVX_SELECT is not defined in this file -- presumably it
 * comes from the xor_32.h/xor_64.h include above and picks the AVX
 * template when available, else FASTEST; confirm in those headers.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
        AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */
503 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php