~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/s390/crypto/crc32le-vx.c

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /* SPDX-License-Identifier: GPL-2.0 */
  2 /*
  3  * Hardware-accelerated CRC-32 variants for Linux on z Systems
  4  *
  5  * Use the z/Architecture Vector Extension Facility to accelerate the
  6  * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet
  7  * and Castagnoli.
  8  *
  9  * This CRC-32 implementation algorithm is bitreflected and processes
 10  * the least-significant bit first (Little-Endian).
 11  *
 12  * Copyright IBM Corp. 2015
 13  * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
 14  */
 15 
 16 #include <linux/types.h>
 17 #include <asm/fpu.h>
 18 #include "crc32-vx.h"
 19 
 20 /* Vector register numbers (V9..V14) that hold the CRC-32 constant pool */
 21 #define CONST_PERM_LE2BE        9
 22 #define CONST_R2R1              10
 23 #define CONST_R4R3              11
 24 #define CONST_R5                12
 25 #define CONST_RU_POLY           13
 26 #define CONST_CRC_POLY          14
 27 
 28 /*
 29  * The CRC-32 constant block contains reduction constants to fold and
 30  * process particular chunks of the input data stream in parallel.
 31  *
 32  * For the CRC-32 variants, the constants are precomputed according to
 33  * these definitions:
 34  *
 35  *      R1 = [(x^(4*128+32) mod P'(x) << 32)]' << 1
 36  *      R2 = [(x^(4*128-32) mod P'(x) << 32)]' << 1
 37  *      R3 = [(x^(128+32) mod P'(x) << 32)]'   << 1
 38  *      R4 = [(x^(128-32) mod P'(x) << 32)]'   << 1
 39  *      R5 = [(x^64 mod P'(x) << 32)]'         << 1
 40  *      R6 = [(x^32 mod P'(x) << 32)]'         << 1
 41  *
 42  *      The bitreflected Barrett reduction constant, u', is defined as
 43  *      the bit reversal of floor(x**64 / P(x)).
 44  *
 45  *      where P(x) is the polynomial in the normal domain and the P'(x) is the
 46  *      polynomial in the reversed (bitreflected) domain.
 47  *
 48  * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
 49  *
 50  *      P(x)  = 0x04C11DB7
 51  *      P'(x) = 0xEDB88320
 52  *
 53  * CRC-32C (Castagnoli) polynomials:
 54  *
 55  *      P(x)  = 0x1EDC6F41
 56  *      P'(x) = 0x82F63B78
 57  */
 58 
 59 static unsigned long constants_CRC_32_LE[] = {
 60         0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask: byte indices 15..0 */
 61         0x1c6e41596, 0x154442bd4,               /* R2, R1 */
 62         0x0ccaa009e, 0x1751997d0,               /* R4, R3 */
 63         0x0, 0x163cd6124,                       /* R5 (leftmost doubleword zero) */
 64         0x0, 0x1f7011641,                       /* u' (Barrett reduction constant) */
 65         0x0, 0x1db710641                        /* P'(x) << 1, P'(x) = 0xEDB88320 */
 66 };
 67 
 68 static unsigned long constants_CRC_32C_LE[] = {
 69         0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask: byte indices 15..0 */
 70         0x09e4addf8, 0x740eef02,                /* R2, R1 */
 71         0x14cd00bd6, 0xf20c0dfe,                /* R4, R3 */
 72         0x0, 0x0dd45aab8,                       /* R5 (leftmost doubleword zero) */
 73         0x0, 0x0dea713f1,                       /* u' (Barrett reduction constant) */
 74         0x0, 0x105ec76f0                        /* P'(x) << 1, P'(x) = 0x82F63B78 */
 75 };
 76 
 77 /**
 78  * crc32_le_vgfm_generic - Compute CRC-32 (LE variant) with vector registers
 79  * @crc: Initial CRC value, typically ~0.
 80  * @buf: Input buffer pointer, performance might be improved if the
 81  *       buffer is on a doubleword boundary.
 82  * @size: Size of the buffer, must be 64 bytes or greater.
     *        The first 64 bytes are consumed unconditionally and @size is an
     *        unsigned size_t, so a smaller value would wrap around.  Only
     *        whole 16-byte blocks are processed; a trailing remainder of
     *        fewer than 16 bytes is never read by this function.
 83  * @constants: CRC-32 constant pool base pointer.
     *             The pool is loaded into V9..V14 with a single load-multiple,
     *             so it must be laid out in the order of the CONST_* register
     *             numbers.
 84  *
 85  * Register usage:
 86  *      V0:       Initial CRC value and intermediate constants and results.
 87  *      V1..V4:   Data for CRC computation.
 88  *      V5..V8:   Next data chunks that are fetched from the input buffer.
 89  *      V9:       Constant for BE->LE conversion and shift operations
 90  *      V10..V14: CRC-32 constants.
     *
     * Return: the 32-bit value extracted from word element 2 of V2 after the
     *         final Barrett reduction step.
 91  */
 92 static u32 crc32_le_vgfm_generic(u32 crc, unsigned char const *buf, size_t size, unsigned long *constants)
 93 {
 94         /* Load CRC-32 constants */
 95         fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_POLY, constants);
 96 
 97         /*
 98          * Load the initial CRC value.
 99          *
100          * The CRC value is loaded into the rightmost word of the
101          * vector register and is later XORed with the LSB portion
102          * of the loaded input data.
103          */
104         fpu_vzero(0);                   /* Clear V0 */
105         fpu_vlvgf(0, crc, 3);           /* Load CRC into rightmost word */
106 
107         /* Load a 64-byte data chunk and XOR with CRC */
108         fpu_vlm(1, 4, buf);
109         fpu_vperm(1, 1, 1, CONST_PERM_LE2BE);
110         fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
111         fpu_vperm(3, 3, 3, CONST_PERM_LE2BE);
112         fpu_vperm(4, 4, 4, CONST_PERM_LE2BE);
113 
114         fpu_vx(1, 0, 1);                /* V1 ^= CRC */
115         buf += 64;
116         size -= 64;
117 
            /* Main loop: fold further 64-byte chunks into V1..V4 */
118         while (size >= 64) {
119                 fpu_vlm(5, 8, buf);
120                 fpu_vperm(5, 5, 5, CONST_PERM_LE2BE);
121                 fpu_vperm(6, 6, 6, CONST_PERM_LE2BE);
122                 fpu_vperm(7, 7, 7, CONST_PERM_LE2BE);
123                 fpu_vperm(8, 8, 8, CONST_PERM_LE2BE);
124                 /*
125                  * Perform a GF(2) multiplication of the doublewords in V1 with
126                  * the R1 and R2 reduction constants in CONST_R2R1 (V10).  The
127                  * intermediate result is then folded (accumulated) with the
128                  * next data chunk in V5 and stored in V1. Repeat this step for
129                  * the register contents in V2, V3, and V4 respectively.
130                  */
131                 fpu_vgfmag(1, CONST_R2R1, 1, 5);
132                 fpu_vgfmag(2, CONST_R2R1, 2, 6);
133                 fpu_vgfmag(3, CONST_R2R1, 3, 7);
134                 fpu_vgfmag(4, CONST_R2R1, 4, 8);
135                 buf += 64;
136                 size -= 64;
137         }
138 
139         /*
140          * Fold V1 to V4 into a single 128-bit value in V1.  Multiply V1 with
141          * R3 and R4 and accumulate the next 128-bit chunk until a single
142          * 128-bit value remains.
143          */
144         fpu_vgfmag(1, CONST_R4R3, 1, 2);
145         fpu_vgfmag(1, CONST_R4R3, 1, 3);
146         fpu_vgfmag(1, CONST_R4R3, 1, 4);
147 
            /* Fold any remaining whole 16-byte blocks into V1, one at a time */
148         while (size >= 16) {
149                 fpu_vl(2, buf);
150                 fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
151                 fpu_vgfmag(1, CONST_R4R3, 1, 2);
152                 buf += 16;
153                 size -= 16;
154         }
155 
156         /*
157          * Set up a vector register for byte shifts.  The shift value must
158          * be loaded in bits 1-4 in byte element 7 of a vector register.
159          * Shift by 8 bytes: 0x40
160          * Shift by 4 bytes: 0x20
             *
             * V9 is reused here; the permutation mask it held is no longer
             * needed because no further input data is loaded below.
161          */
162         fpu_vleib(9, 0x40, 7);
163 
164         /*
165          * Prepare V0 for the next GF(2) multiplication: shift CONST_R4R3 by
166          * 8 bytes into V0 so that R4 ends up in the rightmost doubleword,
167          * then set the leftmost doubleword to 0x1.
168          */
169         fpu_vsrlb(0, CONST_R4R3, 9);
170         fpu_vleig(0, 1, 0);
171 
172         /*
173          * Compute GF(2) product of V1 and V0.  The rightmost doubleword
174          * of V1 is multiplied with R4.  The leftmost doubleword of V1 is
175          * multiplied by 0x1 and is then XORed with rightmost product.
176          * Implicitly, the intermediate leftmost product becomes padded.
177          */
178         fpu_vgfmg(1, 0, 1);
179 
180         /*
181          * Now do the final 32-bit fold by multiplying the rightmost word
182          * in V1 with R5 and XOR the result with the remaining bits in V1.
183          *
184          * To achieve this by a single VGFMAG, right shift V1 by a word
185          * and store the result in V2 which is then accumulated.  Use the
186          * vector unpack instruction to load the rightmost half of the
187          * doubleword into the rightmost doubleword element of V1; the other
188          * half is loaded in the leftmost doubleword.
189          * The vector register with CONST_R5 contains the R5 constant in the
190          * rightmost doubleword and the leftmost doubleword is zero to ignore
191          * the leftmost product of V1.
192          */
193         fpu_vleib(9, 0x20, 7);            /* Shift by words */
194         fpu_vsrlb(2, 1, 9);               /* Store remaining bits in V2 */
195         fpu_vupllf(1, 1);                 /* Split rightmost doubleword */
196         fpu_vgfmag(1, CONST_R5, 1, 2);    /* V1 = (V1 * R5) XOR V2 */
197 
198         /*
199          * Apply a Barrett reduction to compute the final 32-bit CRC value.
200          *
201          * The input values to the Barrett reduction are the degree-63
202          * polynomial in V1 (R(x)), degree-32 generator polynomial, and the
203          * reduction constant u.  The Barrett reduction result is the CRC value
204          * of R(x) mod P(x).
205          *
206          * The Barrett reduction algorithm is defined as:
207          *
208          *    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
209          *    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
210          *    3. C(x)  = R(x) XOR T2(x) mod x^32
211          *
212          *  Note: The leftmost doubleword of vector register containing
213          *  CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
214          *  is zero and does not contribute to the final result.
215          */
216 
217         /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
218         fpu_vupllf(2, 1);
219         fpu_vgfmg(2, CONST_RU_POLY, 2);
220 
221         /*
222          * Compute the GF(2) product of the CRC polynomial with T1(x) in
223          * V2 and XOR the intermediate result, T2(x), with the value in V1.
224          * The final result is stored in word element 2 of V2.
225          */
226         fpu_vupllf(2, 2);
227         fpu_vgfmag(2, CONST_CRC_POLY, 2, 1);
228 
229         return fpu_vlgvf(2, 2);
230 }
231 
232 u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
233 {
            /* CRC-32 (IEEE 802.3) flavour: shared vector routine, CRC-32 pool */
            unsigned long *pool = constants_CRC_32_LE;

234         return crc32_le_vgfm_generic(crc, buf, size, pool);
235 }
236 
237 u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
238 {
            /* CRC-32C (Castagnoli) flavour: shared vector routine, CRC-32C pool */
            unsigned long *pool = constants_CRC_32C_LE;

239         return crc32_le_vgfm_generic(crc, buf, size, pool);
240 }
241 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php