1 /* SPDX-License-Identifier: GPL-2.0 */ 1 2 /* 3 * Hardware-accelerated CRC-32 variants for Li 4 * 5 * Use the z/Architecture Vector Extension Fac 6 * computing of bitreflected CRC-32 checksums 7 * and Castagnoli. 8 * 9 * This CRC-32 implementation algorithm is bit 10 * the least-significant bit first (Little-End 11 * 12 * Copyright IBM Corp. 2015 13 * Author(s): Hendrik Brueckner <brueckner@lin 14 */ 15 16 #include <linux/types.h> 17 #include <asm/fpu.h> 18 #include "crc32-vx.h" 19 20 /* Vector register range containing CRC-32 con 21 #define CONST_PERM_LE2BE 9 22 #define CONST_R2R1 10 23 #define CONST_R4R3 11 24 #define CONST_R5 12 25 #define CONST_RU_POLY 13 26 #define CONST_CRC_POLY 14 27 28 /* 29 * The CRC-32 constant block contains reductio 30 * process particular chunks of the input data 31 * 32 * For the CRC-32 variants, the constants are 33 * these definitions: 34 * 35 * R1 = [(x4*128+32 mod P'(x) << 32)]' << 36 * R2 = [(x4*128-32 mod P'(x) << 32)]' << 37 * R3 = [(x128+32 mod P'(x) << 32)]' << 38 * R4 = [(x128-32 mod P'(x) << 32)]' << 39 * R5 = [(x64 mod P'(x) << 32)]' << 40 * R6 = [(x32 mod P'(x) << 32)]' << 41 * 42 * The bitreflected Barret reduction cons 43 * the bit reversal of floor(x**64 / P(x) 44 * 45 * where P(x) is the polynomial in the no 46 * polynomial in the reversed (bitreflect 47 * 48 * CRC-32 (IEEE 802.3 Ethernet, ...) polynomia 49 * 50 * P(x) = 0x04C11DB7 51 * P'(x) = 0xEDB88320 52 * 53 * CRC-32C (Castagnoli) polynomials: 54 * 55 * P(x) = 0x1EDC6F41 56 * P'(x) = 0x82F63B78 57 */ 58 59 static unsigned long constants_CRC_32_LE[] = { 60 0x0f0e0d0c0b0a0908, 0x0706050403020100 61 0x1c6e41596, 0x154442bd4, 62 0x0ccaa009e, 0x1751997d0, 63 0x0, 0x163cd6124, 64 0x0, 0x1f7011641, 65 0x0, 0x1db710641 66 }; 67 68 static unsigned long constants_CRC_32C_LE[] = 69 0x0f0e0d0c0b0a0908, 0x0706050403020100 70 0x09e4addf8, 0x740eef02, 71 0x14cd00bd6, 0xf20c0dfe, 72 0x0, 0x0dd45aab8, 73 0x0, 0x0dea713f1, 74 0x0, 0x105ec76f0 75 }; 76 77 /** 78 * crc32_le_vgfm_generic - Compute CRC-32 (LE 79 * @crc: Initial CRC value, typically ~0. 80 * @buf: Input buffer pointer, performance mig 81 * buffer is on a doubleword boundary. 82 * @size: Size of the buffer, must be 64 bytes 83 * @constants: CRC-32 constant pool base point 84 * 85 * Register usage: 86 * V0: Initial CRC value and interm 87 * V1..V4: Data for CRC computation. 88 * V5..V8: Next data chunks that are fe 89 * V9: Constant for BE->LE conversi 90 * V10..V14: CRC-32 constants. 91 */ 92 static u32 crc32_le_vgfm_generic(u32 crc, unsi 93 { 94 /* Load CRC-32 constants */ 95 fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_PO 96 97 /* 98 * Load the initial CRC value. 99 * 100 * The CRC value is loaded into the ri 101 * vector register and is later XORed 102 * of the loaded input data. 103 */ 104 fpu_vzero(0); /* Cle 105 fpu_vlvgf(0, crc, 3); /* Loa 106 107 /* Load a 64-byte data chunk and XOR w 108 fpu_vlm(1, 4, buf); 109 fpu_vperm(1, 1, 1, CONST_PERM_LE2BE); 110 fpu_vperm(2, 2, 2, CONST_PERM_LE2BE); 111 fpu_vperm(3, 3, 3, CONST_PERM_LE2BE); 112 fpu_vperm(4, 4, 4, CONST_PERM_LE2BE); 113 114 fpu_vx(1, 0, 1); /* V1 115 buf += 64; 116 size -= 64; 117 118 while (size >= 64) { 119 fpu_vlm(5, 8, buf); 120 fpu_vperm(5, 5, 5, CONST_PERM_ 121 fpu_vperm(6, 6, 6, CONST_PERM_ 122 fpu_vperm(7, 7, 7, CONST_PERM_ 123 fpu_vperm(8, 8, 8, CONST_PERM_ 124 /* 125 * Perform a GF(2) multiplicat 126 * the R1 and R2 reduction con 127 * result is then folded (accu 128 * in V5 and stored in V1. Rep 129 * contents in V2, V3, and V4 130 */ 131 fpu_vgfmag(1, CONST_R2R1, 1, 5 132 fpu_vgfmag(2, CONST_R2R1, 2, 6 133 fpu_vgfmag(3, CONST_R2R1, 3, 7 134 fpu_vgfmag(4, CONST_R2R1, 4, 8 135 buf += 64; 136 size -= 64; 137 } 138 139 /* 140 * Fold V1 to V4 into a single 128-bit 141 * and R4 and accumulating the next 12 142 * value remains. 143 */ 144 fpu_vgfmag(1, CONST_R4R3, 1, 2); 145 fpu_vgfmag(1, CONST_R4R3, 1, 3); 146 fpu_vgfmag(1, CONST_R4R3, 1, 4); 147 148 while (size >= 16) { 149 fpu_vl(2, buf); 150 fpu_vperm(2, 2, 2, CONST_PERM_ 151 fpu_vgfmag(1, CONST_R4R3, 1, 2 152 buf += 16; 153 size -= 16; 154 } 155 156 /* 157 * Set up a vector register for byte s 158 * be loaded in bits 1-4 in byte eleme 159 * Shift by 8 bytes: 0x40 160 * Shift by 4 bytes: 0x20 161 */ 162 fpu_vleib(9, 0x40, 7); 163 164 /* 165 * Prepare V0 for the next GF(2) multi 166 * to move R4 into the rightmost doubl 167 * doubleword to 0x1. 168 */ 169 fpu_vsrlb(0, CONST_R4R3, 9); 170 fpu_vleig(0, 1, 0); 171 172 /* 173 * Compute GF(2) product of V1 and V0. 174 * of V1 is multiplied with R4. The l 175 * multiplied by 0x1 and is then XORed 176 * Implicitly, the intermediate leftmo 177 */ 178 fpu_vgfmg(1, 0, 1); 179 180 /* 181 * Now do the final 32-bit fold by mul 182 * in V1 with R5 and XOR the result wi 183 * 184 * To achieve this by a single VGFMAG, 185 * and store the result in V2 which is 186 * vector unpack instruction to load t 187 * doubleword into the rightmost doubl 188 * half is loaded in the leftmost doub 189 * The vector register with CONST_R5 c 190 * rightmost doubleword and the leftmo 191 * the leftmost product of V1. 192 */ 193 fpu_vleib(9, 0x20, 7); /* S 194 fpu_vsrlb(2, 1, 9); /* S 195 fpu_vupllf(1, 1); /* S 196 fpu_vgfmag(1, CONST_R5, 1, 2); /* V 197 198 /* 199 * Apply a Barret reduction to compute 200 * 201 * The input values to the Barret redu 202 * in V1 (R(x)), degree-32 generator p 203 * constant u. The Barret reduction r 204 * P(x). 205 * 206 * The Barret reduction algorithm is d 207 * 208 * 1. T1(x) = floor( R(x) / x^32 ) 209 * 2. T2(x) = floor( T1(x) / x^32 ) 210 * 3. C(x) = R(x) XOR T2(x) mod x^ 211 * 212 * Note: The leftmost doubleword of v 213 * CONST_RU_POLY is zero and, thus, t 214 * is zero and does not contribute to 215 */ 216 217 /* T1(x) = floor( R(x) / x^32 ) GF2MUL 218 fpu_vupllf(2, 1); 219 fpu_vgfmg(2, CONST_RU_POLY, 2); 220 221 /* 222 * Compute the GF(2) product of the CR 223 * V2 and XOR the intermediate result, 224 * The final result is stored in word 225 */ 226 fpu_vupllf(2, 2); 227 fpu_vgfmag(2, CONST_CRC_POLY, 2, 1); 228 229 return fpu_vlgvf(2, 2); 230 } 231 232 u32 crc32_le_vgfm_16(u32 crc, unsigned char co 233 { 234 return crc32_le_vgfm_generic(crc, buf, 235 } 236 237 u32 crc32c_le_vgfm_16(u32 crc, unsigned char c 238 { 239 return crc32_le_vgfm_generic(crc, buf, 240 } 241
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.