########################################################################
# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
#
# Copyright (c) 2013, Intel Corporation
#
# Authors:
#     Erdinc Ozturk <erdinc.ozturk@intel.com>
#     Vinodh Gopal <vinodh.gopal@intel.com>
#     James Guilford <james.guilford@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Reference paper titled "Fast CRC Computation for Generic
# Polynomials Using PCLMULQDQ Instruction"
# URL: http://www.intel.com/content/dam/www/public/us/en/documents
#  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
#

#include <linux/linkage.h>

.text

#define		init_crc	%edi
#define		buf		%rsi
#define		len		%rdx

#define		FOLD_CONSTS	%xmm10
#define		BSWAP_MASK	%xmm11

# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
# reg1, reg2.
.macro	fold_32_bytes	offset, reg1, reg2
	movdqu	\offset(buf), %xmm9
	movdqu	\offset+16(buf), %xmm12
	pshufb	BSWAP_MASK, %xmm9
	pshufb	BSWAP_MASK, %xmm12
	movdqa	\reg1, %xmm8
	movdqa	\reg2, %xmm13
	pclmulqdq	$0x00, FOLD_CONSTS, \reg1
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm8
	pclmulqdq	$0x00, FOLD_CONSTS, \reg2
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm13
	pxor	%xmm9 , \reg1
	xorps	%xmm8 , \reg1
	pxor	%xmm12, \reg2
	xorps	%xmm13, \reg2
.endm

# Fold src_reg into dst_reg.
.macro	fold_16_bytes	src_reg, dst_reg
	movdqa	\src_reg, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, \src_reg
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, \dst_reg
	xorps	\src_reg, \dst_reg
.endm

#
# u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
#
# Assumes len >= 16.
#
SYM_FUNC_START(crc_t10dif_pcl)

	movdqa	.Lbswap_mask(%rip), BSWAP_MASK

	# For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp	$256, len
	jl	.Lless_than_256_bytes

	# Load the first 128 data bytes.  Byte swapping is necessary to make
	# the bit order match the polynomial coefficient order.
	movdqu	16*0(buf), %xmm0
	movdqu	16*1(buf), %xmm1
	movdqu	16*2(buf), %xmm2
	movdqu	16*3(buf), %xmm3
	movdqu	16*4(buf), %xmm4
	movdqu	16*5(buf), %xmm5
	movdqu	16*6(buf), %xmm6
	movdqu	16*7(buf), %xmm7
	add	$128, buf
	pshufb	BSWAP_MASK, %xmm0
	pshufb	BSWAP_MASK, %xmm1
	pshufb	BSWAP_MASK, %xmm2
	pshufb	BSWAP_MASK, %xmm3
	pshufb	BSWAP_MASK, %xmm4
	pshufb	BSWAP_MASK, %xmm5
	pshufb	BSWAP_MASK, %xmm6
	pshufb	BSWAP_MASK, %xmm7

	# XOR the first 16 data *bits* with the initial CRC value.
	pxor	%xmm8, %xmm8
	pinsrw	$7, init_crc, %xmm8
	pxor	%xmm8, %xmm0

	movdqa	.Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS

	# Subtract 128 for the 128 data bytes just consumed.  Subtract another
	# 128 to simplify the termination condition of the following loop.
	sub	$256, len

	# While >= 128 data bytes remain (not counting xmm0-7), fold the 128
	# bytes xmm0-7 into them, storing the result back into xmm0-7.
.Lfold_128_bytes_loop:
	fold_32_bytes	0, %xmm0, %xmm1
	fold_32_bytes	32, %xmm2, %xmm3
	fold_32_bytes	64, %xmm4, %xmm5
	fold_32_bytes	96, %xmm6, %xmm7
	add	$128, buf
	sub	$128, len
	jge	.Lfold_128_bytes_loop
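
	# Informal sketch of the fold identity used by the loop above: over
	# GF(2), if the pending message is M(x) = A(x)*x^1024 + B(x), where
	# B(x) is the newest 1024 bits (128 bytes), then
	#
	#	M(x) mod G(x) = (A(x) * (x^1024 mod G(x)) + B(x)) mod G(x)
	#
	# Each 128-bit register X = X_hi*x^64 + X_lo is therefore folded into
	# the data 128 bytes ahead as
	#
	#	X_hi * (x^(1024+64) mod G(x))  XOR  X_lo * (x^1024 mod G(x)),
	#
	# which is the pair of pclmulqdq instructions ($0x11 and $0x00) plus
	# the two XORs in fold_32_bytes.  All additions are XORs, no carries.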

	# Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.

	# Fold across 64 bytes.
	movdqa	.Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm0, %xmm4
	fold_16_bytes	%xmm1, %xmm5
	fold_16_bytes	%xmm2, %xmm6
	fold_16_bytes	%xmm3, %xmm7
	# Fold across 32 bytes.
	movdqa	.Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm4, %xmm6
	fold_16_bytes	%xmm5, %xmm7
	# Fold across 16 bytes.
	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm6, %xmm7

	# Add 128 to get the correct number of data bytes remaining in 0...127
	# (not counting xmm7), following the previous extra subtraction by 128.
	# Then subtract 16 to simplify the termination condition of the
	# following loop.
	add	$128-16, len

	# While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
	# xmm7 into them, storing the result back into xmm7.
	jl	.Lfold_16_bytes_loop_done
.Lfold_16_bytes_loop:
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, %xmm7
	movdqu	(buf), %xmm0
	pshufb	BSWAP_MASK, %xmm0
	pxor	%xmm0 , %xmm7
	add	$16, buf
	sub	$16, len
	jge	.Lfold_16_bytes_loop

.Lfold_16_bytes_loop_done:
	# Add 16 to get the correct number of data bytes remaining in 0...15
	# (not counting xmm7), following the previous extra subtraction by 16.
	add	$16, len
	je	.Lreduce_final_16_bytes

.Lhandle_partial_segment:
	# Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	# 16 bytes are in xmm7 and the rest are the remaining data in 'buf'.
	# To do this without needing a fold constant for each possible 'len',
	# redivide the bytes into a first chunk of 'len' bytes and a second
	# chunk of 16 bytes, then fold the first chunk into the second chunk.

	movdqa	%xmm7, %xmm2

	# xmm1 = last 16 original data bytes
	movdqu	-16(buf, len), %xmm1
	pshufb	BSWAP_MASK, %xmm1

	# xmm2 = high order part of second chunk: xmm7 left-shifted by
	# 'len' bytes.
	lea	.Lbyteshift_table+16(%rip), %rax
	sub	len, %rax
	movdqu	(%rax), %xmm0
	pshufb	%xmm0, %xmm2

	# xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
	pxor	.Lmask1(%rip), %xmm0
	pshufb	%xmm0, %xmm7

	# xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes), then
	# '16-len' bytes from xmm2 (high-order bytes).
	pblendvb	%xmm2, %xmm1	#xmm0 is implicit

	# Fold the first chunk into the second chunk, storing the result in
	# xmm7.
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, %xmm7
	pxor	%xmm1, %xmm7
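
	# A concrete instance of the redivision above, as an informal example:
	# with len = 3, the last 19 message bytes are the 16 bytes in xmm7
	# followed by 3 bytes in memory.  They are re-split into a first chunk
	# holding the oldest 3 bytes of xmm7, and a 16-byte second chunk
	# holding the remaining 13 bytes of xmm7 plus the 3 trailing data
	# bytes.  The first chunk then sits exactly 128 bits ahead of the
	# second, so the fixed fold-across-16-bytes constants apply and no
	# per-'len' constant table is needed.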

.Lreduce_final_16_bytes:
	# Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit
	# CRC.

	# Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	movdqa	.Lfinal_fold_consts(%rip), FOLD_CONSTS

	# Fold the high 64 bits into the low 64 bits, while also multiplying by
	# x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	# whose low 48 bits are 0.
	movdqa	%xmm7, %xmm0
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
	pslldq	$8, %xmm0
	pxor	%xmm0, %xmm7		  # + low bits * x^64

	# Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	# value congruent to x^64 * M(x) and whose low 48 bits are 0.
	movdqa	%xmm7, %xmm0
	pand	.Lmask2(%rip), %xmm0	  # zero high 32 bits
	psrldq	$12, %xmm7		  # extract high 32 bits
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
	pxor	%xmm0, %xmm7		  # + low bits

	# Load G(x) and floor(x^48 / G(x)).
	movdqa	.Lbarrett_reduction_consts(%rip), FOLD_CONSTS

	# Use Barrett reduction to compute the final CRC value.
	movdqa	%xmm7, %xmm0
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
	psrlq	$32, %xmm7		  # /= x^32
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # *= G(x)
	psrlq	$48, %xmm0
	pxor	%xmm7, %xmm0		  # + low 16 nonzero bits
	# Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.

	pextrw	$0, %xmm0, %eax
	RET

.align 16
.Lless_than_256_bytes:
	# Checksumming a buffer of length 16...255 bytes

	# Load the first 16 data bytes.
	movdqu	(buf), %xmm7
	pshufb	BSWAP_MASK, %xmm7
	add	$16, buf

	# XOR the first 16 data *bits* with the initial CRC value.
	pxor	%xmm0, %xmm0
	pinsrw	$7, init_crc, %xmm0
	pxor	%xmm0, %xmm7

	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
	cmp	$16, len
	je	.Lreduce_final_16_bytes		# len == 16
	sub	$32, len
	jge	.Lfold_16_bytes_loop		# 32 <= len <= 255
	add	$16, len
	jmp	.Lhandle_partial_segment	# 17 <= len <= 31
SYM_FUNC_END(crc_t10dif_pcl)

.section	.rodata, "a", @progbits
.align	16

# Fold constants precomputed from the polynomial 0x18bb7
# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	# x^(8*128)	mod G(x)
	.quad		0x0000000000002295	# x^(8*128+64)	mod G(x)
.Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	# x^(4*128)	mod G(x)
	.quad		0x000000000000dd31	# x^(4*128+64)	mod G(x)
.Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	# x^(2*128)	mod G(x)
	.quad		0x0000000000007acc	# x^(2*128+64)	mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	# x^(1*128)	mod G(x)
	.quad		0x0000000000001faa	# x^(1*128+64)	mod G(x)
.Lfinal_fold_consts:
	.quad		0x1368000000000000	# x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	# x^48 * (x^80 mod G(x))
.Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	# G(x)
	.quad		0x00000001f65a57f8	# floor(x^48 / G(x))

.section	.rodata.cst16.mask1, "aM", @progbits, 16
.align	16
.Lmask1:
	.octa	0x80808080808080808080808080808080

.section	.rodata.cst16.mask2, "aM", @progbits, 16
.align	16
.Lmask2:
	.octa	0x00000000FFFFFFFFFFFFFFFFFFFFFFFF

.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align	16
.Lbswap_mask:
	.octa	0x000102030405060708090A0B0C0D0E0F

.section	.rodata.cst32.byteshift_table, "aM", @progbits, 32
.align	16
# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16-len]
# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
# 0x80} XOR the index vector to shift right by '16-len' bytes.
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
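
# The 'x^N mod G(x)' constants above can be regenerated with plain bitwise
# polynomial arithmetic over GF(2).  An informal sketch in C (the helper
# name below is hypothetical, not part of the kernel):
#
#	/* Compute x^n mod G(x) for G(x) = 0x18bb7 (CRC-T10DIF). */
#	static u32 xpow_mod_g(unsigned int n)
#	{
#		u32 r = 1;			/* the polynomial '1' */
#
#		while (n--) {
#			r <<= 1;		/* multiply by x */
#			if (r & 0x10000)	/* degree reached 16, */
#				r ^= 0x18bb7;	/* so subtract (XOR) G(x) */
#		}
#		return r;
#	}
#
# e.g. xpow_mod_g(8*128) should reproduce the .Lfold_across_128_bytes_consts
# entry 0x6123, and floor(x^48 / G(x)) for the Barrett step falls out of the
# analogous carryless polynomial long division.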