1 // SPDX-License-Identifier: GPL-2.0-or-later 1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* -*- linux-c -*- --------------------------- 2 /* -*- linux-c -*- -------------------------------------------------------- 3 * 3 * 4 * Copyright (C) 2016 Intel Corporation 4 * Copyright (C) 2016 Intel Corporation 5 * 5 * 6 * Author: Gayatri Kammela <gayatri.kammela@ 6 * Author: Gayatri Kammela <gayatri.kammela@intel.com> 7 * Author: Megha Dey <megha.dey@linux.intel. 7 * Author: Megha Dey <megha.dey@linux.intel.com> 8 * 8 * 9 * Based on avx2.c: Copyright 2012 Yuanhan L 9 * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved 10 * Based on sse2.c: Copyright 2002 H. Peter 10 * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved 11 * 11 * 12 * ------------------------------------------- 12 * ----------------------------------------------------------------------- 13 */ 13 */ 14 14 15 /* 15 /* 16 * AVX512 implementation of RAID-6 syndrome fu 16 * AVX512 implementation of RAID-6 syndrome functions 17 * 17 * 18 */ 18 */ 19 19 20 #ifdef CONFIG_AS_AVX512 20 #ifdef CONFIG_AS_AVX512 21 21 22 #include <linux/raid/pq.h> 22 #include <linux/raid/pq.h> 23 #include "x86.h" 23 #include "x86.h" 24 24 25 static const struct raid6_avx512_constants { 25 static const struct raid6_avx512_constants { 26 u64 x1d[8]; 26 u64 x1d[8]; 27 } raid6_avx512_constants __aligned(512/8) = { 27 } raid6_avx512_constants __aligned(512/8) = { 28 { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1 28 { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 29 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1 29 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 30 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1 30 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 31 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1 31 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, 32 }; 32 }; 33 33 34 static int raid6_have_avx512(void) 34 static int raid6_have_avx512(void) 35 { 35 { 36 return boot_cpu_has(X86_FEATURE_AVX2) 36 return boot_cpu_has(X86_FEATURE_AVX2) && 37 boot_cpu_has(X86_FEATURE_AVX) 37 boot_cpu_has(X86_FEATURE_AVX) && 38 boot_cpu_has(X86_FEATURE_AVX51 38 boot_cpu_has(X86_FEATURE_AVX512F) && 39 boot_cpu_has(X86_FEATURE_AVX51 39 boot_cpu_has(X86_FEATURE_AVX512BW) && 40 boot_cpu_has(X86_FEATURE_AVX51 40 boot_cpu_has(X86_FEATURE_AVX512VL) && 41 boot_cpu_has(X86_FEATURE_AVX51 41 boot_cpu_has(X86_FEATURE_AVX512DQ); 42 } 42 } 43 43 44 static void raid6_avx5121_gen_syndrome(int dis 44 static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs) 45 { 45 { 46 u8 **dptr = (u8 **)ptrs; 46 u8 **dptr = (u8 **)ptrs; 47 u8 *p, *q; 47 u8 *p, *q; 48 int d, z, z0; 48 int d, z, z0; 49 49 50 z0 = disks - 3; /* Highest dat 50 z0 = disks - 3; /* Highest data disk */ 51 p = dptr[z0+1]; /* XOR parity 51 p = dptr[z0+1]; /* XOR parity */ 52 q = dptr[z0+2]; /* RS syndrome 52 q = dptr[z0+2]; /* RS syndrome */ 53 53 54 kernel_fpu_begin(); 54 kernel_fpu_begin(); 55 55 56 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 56 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 57 "vpxorq %%zmm1,%%zmm1,%%z 57 "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ 58 : 58 : 59 : "m" (raid6_avx512_const 59 : "m" (raid6_avx512_constants.x1d[0])); 60 60 61 for (d = 0; d < bytes; d += 64) { 61 for (d = 0; d < bytes; d += 64) { 62 asm volatile("prefetchnta %0\n 62 asm volatile("prefetchnta %0\n\t" 63 "vmovdqa64 %0,%%z 63 "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ 64 "prefetchnta %1\n 64 "prefetchnta %1\n\t" 65 "vmovdqa64 %%zmm2 65 "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ 66 "vmovdqa64 %1,%%z 66 "vmovdqa64 %1,%%zmm6" 67 : 67 : 68 : "m" (dptr[z0][d 68 : "m" (dptr[z0][d]), "m" (dptr[z0-1][d])); 69 for (z = z0-2; z >= 0; z--) { 69 for (z = z0-2; z >= 0; z--) { 70 asm volatile("prefetch 70 asm volatile("prefetchnta %0\n\t" 71 "vpcmpgtb 71 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 72 "vpmovm2b 72 "vpmovm2b %%k1,%%zmm5\n\t" 73 "vpaddb % 73 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 74 "vpandq % 74 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 75 "vpxorq % 75 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 76 "vpxorq % 76 "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" 77 "vpxorq % 77 "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" 78 "vmovdqa6 78 "vmovdqa64 %0,%%zmm6" 79 : 79 : 80 : "m" (dp 80 : "m" (dptr[z][d])); 81 } 81 } 82 asm volatile("vpcmpgtb %%zmm4, 82 asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 83 "vpmovm2b %%k1,%% 83 "vpmovm2b %%k1,%%zmm5\n\t" 84 "vpaddb %%zmm4,%% 84 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 85 "vpandq %%zmm0,%% 85 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 86 "vpxorq %%zmm5,%% 86 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 87 "vpxorq %%zmm6,%% 87 "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" 88 "vpxorq %%zmm6,%% 88 "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" 89 "vmovntdq %%zmm2, 89 "vmovntdq %%zmm2,%0\n\t" 90 "vpxorq %%zmm2,%% 90 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" 91 "vmovntdq %%zmm4, 91 "vmovntdq %%zmm4,%1\n\t" 92 "vpxorq %%zmm4,%% 92 "vpxorq %%zmm4,%%zmm4,%%zmm4" 93 : 93 : 94 : "m" (p[d]), "m" 94 : "m" (p[d]), "m" (q[d])); 95 } 95 } 96 96 97 asm volatile("sfence" : : : "memory"); 97 asm volatile("sfence" : : : "memory"); 98 kernel_fpu_end(); 98 kernel_fpu_end(); 99 } 99 } 100 100 101 static void raid6_avx5121_xor_syndrome(int dis 101 static void raid6_avx5121_xor_syndrome(int disks, int start, int stop, 102 size_t 102 size_t bytes, void **ptrs) 103 { 103 { 104 u8 **dptr = (u8 **)ptrs; 104 u8 **dptr = (u8 **)ptrs; 105 u8 *p, *q; 105 u8 *p, *q; 106 int d, z, z0; 106 int d, z, z0; 107 107 108 z0 = stop; /* P/Q right s 108 z0 = stop; /* P/Q right side optimization */ 109 p = dptr[disks-2]; /* XOR parity 109 p = dptr[disks-2]; /* XOR parity */ 110 q = dptr[disks-1]; /* RS syndrome 110 q = dptr[disks-1]; /* RS syndrome */ 111 111 112 kernel_fpu_begin(); 112 kernel_fpu_begin(); 113 113 114 asm volatile("vmovdqa64 %0,%%zmm0" 114 asm volatile("vmovdqa64 %0,%%zmm0" 115 : : "m" (raid6_avx512_con 115 : : "m" (raid6_avx512_constants.x1d[0])); 116 116 117 for (d = 0 ; d < bytes ; d += 64) { 117 for (d = 0 ; d < bytes ; d += 64) { 118 asm volatile("vmovdqa64 %0,%%z 118 asm volatile("vmovdqa64 %0,%%zmm4\n\t" 119 "vmovdqa64 %1,%%z 119 "vmovdqa64 %1,%%zmm2\n\t" 120 "vpxorq %%zmm4,%% 120 "vpxorq %%zmm4,%%zmm2,%%zmm2" 121 : 121 : 122 : "m" (dptr[z0][d 122 : "m" (dptr[z0][d]), "m" (p[d])); 123 /* P/Q data pages */ 123 /* P/Q data pages */ 124 for (z = z0-1 ; z >= start ; z 124 for (z = z0-1 ; z >= start ; z--) { 125 asm volatile("vpxorq % 125 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 126 "vpcmpgtb 126 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 127 "vpmovm2b 127 "vpmovm2b %%k1,%%zmm5\n\t" 128 "vpaddb % 128 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 129 "vpandq % 129 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 130 "vpxorq % 130 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 131 "vmovdqa6 131 "vmovdqa64 %0,%%zmm5\n\t" 132 "vpxorq % 132 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 133 "vpxorq % 133 "vpxorq %%zmm5,%%zmm4,%%zmm4" 134 : 134 : 135 : "m" (dp 135 : "m" (dptr[z][d])); 136 } 136 } 137 /* P/Q left side optimization 137 /* P/Q left side optimization */ 138 for (z = start-1 ; z >= 0 ; z- 138 for (z = start-1 ; z >= 0 ; z--) { 139 asm volatile("vpxorq % 139 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 140 "vpcmpgtb 140 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 141 "vpmovm2b 141 "vpmovm2b %%k1,%%zmm5\n\t" 142 "vpaddb % 142 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 143 "vpandq % 143 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 144 "vpxorq % 144 "vpxorq %%zmm5,%%zmm4,%%zmm4" 145 : 145 : 146 : ); 146 : ); 147 } 147 } 148 asm volatile("vpxorq %0,%%zmm4 148 asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" 149 /* Don't use movntdq for r/w m 149 /* Don't use movntdq for r/w memory area < cache line */ 150 "vmovdqa64 %%zmm4 150 "vmovdqa64 %%zmm4,%0\n\t" 151 "vmovdqa64 %%zmm2 151 "vmovdqa64 %%zmm2,%1" 152 : 152 : 153 : "m" (q[d]), "m" 153 : "m" (q[d]), "m" (p[d])); 154 } 154 } 155 155 156 asm volatile("sfence" : : : "memory"); 156 asm volatile("sfence" : : : "memory"); 157 kernel_fpu_end(); 157 kernel_fpu_end(); 158 } 158 } 159 159 160 const struct raid6_calls raid6_avx512x1 = { 160 const struct raid6_calls raid6_avx512x1 = { 161 raid6_avx5121_gen_syndrome, 161 raid6_avx5121_gen_syndrome, 162 raid6_avx5121_xor_syndrome, 162 raid6_avx5121_xor_syndrome, 163 raid6_have_avx512, 163 raid6_have_avx512, 164 "avx512x1", 164 "avx512x1", 165 .priority = 2 /* Prefer AVX5 !! 165 1 /* Has cache hints */ 166 }; 166 }; 167 167 168 /* 168 /* 169 * Unrolled-by-2 AVX512 implementation 169 * Unrolled-by-2 AVX512 implementation 170 */ 170 */ 171 static void raid6_avx5122_gen_syndrome(int dis 171 static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs) 172 { 172 { 173 u8 **dptr = (u8 **)ptrs; 173 u8 **dptr = (u8 **)ptrs; 174 u8 *p, *q; 174 u8 *p, *q; 175 int d, z, z0; 175 int d, z, z0; 176 176 177 z0 = disks - 3; /* Highest dat 177 z0 = disks - 3; /* Highest data disk */ 178 p = dptr[z0+1]; /* XOR parity 178 p = dptr[z0+1]; /* XOR parity */ 179 q = dptr[z0+2]; /* RS syndrome 179 q = dptr[z0+2]; /* RS syndrome */ 180 180 181 kernel_fpu_begin(); 181 kernel_fpu_begin(); 182 182 183 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 183 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 184 "vpxorq %%zmm1,%%zmm1,%%z 184 "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ 185 : 185 : 186 : "m" (raid6_avx512_const 186 : "m" (raid6_avx512_constants.x1d[0])); 187 187 188 /* We uniformly assume a single prefet 188 /* We uniformly assume a single prefetch covers at least 64 bytes */ 189 for (d = 0; d < bytes; d += 128) { 189 for (d = 0; d < bytes; d += 128) { 190 asm volatile("prefetchnta %0\n 190 asm volatile("prefetchnta %0\n\t" 191 "prefetchnta %1\n 191 "prefetchnta %1\n\t" 192 "vmovdqa64 %0,%%z 192 "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ 193 "vmovdqa64 %1,%%z 193 "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */ 194 "vmovdqa64 %%zmm2 194 "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ 195 "vmovdqa64 %%zmm3 195 "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */ 196 : 196 : 197 : "m" (dptr[z0][d 197 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64])); 198 for (z = z0-1; z >= 0; z--) { 198 for (z = z0-1; z >= 0; z--) { 199 asm volatile("prefetch 199 asm volatile("prefetchnta %0\n\t" 200 "prefetch 200 "prefetchnta %1\n\t" 201 "vpcmpgtb 201 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 202 "vpcmpgtb 202 "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" 203 "vpmovm2b 203 "vpmovm2b %%k1,%%zmm5\n\t" 204 "vpmovm2b 204 "vpmovm2b %%k2,%%zmm7\n\t" 205 "vpaddb % 205 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 206 "vpaddb % 206 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 207 "vpandq % 207 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 208 "vpandq % 208 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 209 "vpxorq % 209 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 210 "vpxorq % 210 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 211 "vmovdqa6 211 "vmovdqa64 %0,%%zmm5\n\t" 212 "vmovdqa6 212 "vmovdqa64 %1,%%zmm7\n\t" 213 "vpxorq % 213 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 214 "vpxorq % 214 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 215 "vpxorq % 215 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 216 "vpxorq % 216 "vpxorq %%zmm7,%%zmm6,%%zmm6" 217 : 217 : 218 : "m" (dp 218 : "m" (dptr[z][d]), "m" (dptr[z][d+64])); 219 } 219 } 220 asm volatile("vmovntdq %%zmm2, 220 asm volatile("vmovntdq %%zmm2,%0\n\t" 221 "vmovntdq %%zmm3, 221 "vmovntdq %%zmm3,%1\n\t" 222 "vmovntdq %%zmm4, 222 "vmovntdq %%zmm4,%2\n\t" 223 "vmovntdq %%zmm6, 223 "vmovntdq %%zmm6,%3" 224 : 224 : 225 : "m" (p[d]), "m" 225 : "m" (p[d]), "m" (p[d+64]), "m" (q[d]), 226 "m" (q[d+64])); 226 "m" (q[d+64])); 227 } 227 } 228 228 229 asm volatile("sfence" : : : "memory"); 229 asm volatile("sfence" : : : "memory"); 230 kernel_fpu_end(); 230 kernel_fpu_end(); 231 } 231 } 232 232 233 static void raid6_avx5122_xor_syndrome(int dis 233 static void raid6_avx5122_xor_syndrome(int disks, int start, int stop, 234 size_t 234 size_t bytes, void **ptrs) 235 { 235 { 236 u8 **dptr = (u8 **)ptrs; 236 u8 **dptr = (u8 **)ptrs; 237 u8 *p, *q; 237 u8 *p, *q; 238 int d, z, z0; 238 int d, z, z0; 239 239 240 z0 = stop; /* P/Q right s 240 z0 = stop; /* P/Q right side optimization */ 241 p = dptr[disks-2]; /* XOR parity 241 p = dptr[disks-2]; /* XOR parity */ 242 q = dptr[disks-1]; /* RS syndrome 242 q = dptr[disks-1]; /* RS syndrome */ 243 243 244 kernel_fpu_begin(); 244 kernel_fpu_begin(); 245 245 246 asm volatile("vmovdqa64 %0,%%zmm0" 246 asm volatile("vmovdqa64 %0,%%zmm0" 247 : : "m" (raid6_avx512_con 247 : : "m" (raid6_avx512_constants.x1d[0])); 248 248 249 for (d = 0 ; d < bytes ; d += 128) { 249 for (d = 0 ; d < bytes ; d += 128) { 250 asm volatile("vmovdqa64 %0,%%z 250 asm volatile("vmovdqa64 %0,%%zmm4\n\t" 251 "vmovdqa64 %1,%%z 251 "vmovdqa64 %1,%%zmm6\n\t" 252 "vmovdqa64 %2,%%z 252 "vmovdqa64 %2,%%zmm2\n\t" 253 "vmovdqa64 %3,%%z 253 "vmovdqa64 %3,%%zmm3\n\t" 254 "vpxorq %%zmm4,%% 254 "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" 255 "vpxorq %%zmm6,%% 255 "vpxorq %%zmm6,%%zmm3,%%zmm3" 256 : 256 : 257 : "m" (dptr[z0][d 257 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), 258 "m" (p[d]), "m" 258 "m" (p[d]), "m" (p[d+64])); 259 /* P/Q data pages */ 259 /* P/Q data pages */ 260 for (z = z0-1 ; z >= start ; z 260 for (z = z0-1 ; z >= start ; z--) { 261 asm volatile("vpxorq % 261 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 262 "vpxorq % 262 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 263 "vpcmpgtb 263 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 264 "vpcmpgtb 264 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 265 "vpmovm2b 265 "vpmovm2b %%k1,%%zmm5\n\t" 266 "vpmovm2b 266 "vpmovm2b %%k2,%%zmm7\n\t" 267 "vpaddb % 267 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 268 "vpaddb % 268 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 269 "vpandq % 269 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 270 "vpandq % 270 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 271 "vpxorq % 271 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 272 "vpxorq % 272 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 273 "vmovdqa6 273 "vmovdqa64 %0,%%zmm5\n\t" 274 "vmovdqa6 274 "vmovdqa64 %1,%%zmm7\n\t" 275 "vpxorq % 275 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 276 "vpxorq % 276 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 277 "vpxorq % 277 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 278 "vpxorq % 278 "vpxorq %%zmm7,%%zmm6,%%zmm6" 279 : 279 : 280 : "m" (dp 280 : "m" (dptr[z][d]), "m" (dptr[z][d+64])); 281 } 281 } 282 /* P/Q left side optimization 282 /* P/Q left side optimization */ 283 for (z = start-1 ; z >= 0 ; z- 283 for (z = start-1 ; z >= 0 ; z--) { 284 asm volatile("vpxorq % 284 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 285 "vpxorq % 285 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 286 "vpcmpgtb 286 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 287 "vpcmpgtb 287 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 288 "vpmovm2b 288 "vpmovm2b %%k1,%%zmm5\n\t" 289 "vpmovm2b 289 "vpmovm2b %%k2,%%zmm7\n\t" 290 "vpaddb % 290 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 291 "vpaddb % 291 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 292 "vpandq % 292 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 293 "vpandq % 293 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 294 "vpxorq % 294 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 295 "vpxorq % 295 "vpxorq %%zmm7,%%zmm6,%%zmm6" 296 : 296 : 297 : ); 297 : ); 298 } 298 } 299 asm volatile("vpxorq %0,%%zmm4 299 asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" 300 "vpxorq %1,%%zmm6 300 "vpxorq %1,%%zmm6,%%zmm6\n\t" 301 /* Don't use movn 301 /* Don't use movntdq for r/w 302 * memory area < 302 * memory area < cache line 303 */ 303 */ 304 "vmovdqa64 %%zmm4 304 "vmovdqa64 %%zmm4,%0\n\t" 305 "vmovdqa64 %%zmm6 305 "vmovdqa64 %%zmm6,%1\n\t" 306 "vmovdqa64 %%zmm2 306 "vmovdqa64 %%zmm2,%2\n\t" 307 "vmovdqa64 %%zmm3 307 "vmovdqa64 %%zmm3,%3" 308 : 308 : 309 : "m" (q[d]), "m" 309 : "m" (q[d]), "m" (q[d+64]), "m" (p[d]), 310 "m" (p[d+64])); 310 "m" (p[d+64])); 311 } 311 } 312 312 313 asm volatile("sfence" : : : "memory"); 313 asm volatile("sfence" : : : "memory"); 314 kernel_fpu_end(); 314 kernel_fpu_end(); 315 } 315 } 316 316 317 const struct raid6_calls raid6_avx512x2 = { 317 const struct raid6_calls raid6_avx512x2 = { 318 raid6_avx5122_gen_syndrome, 318 raid6_avx5122_gen_syndrome, 319 raid6_avx5122_xor_syndrome, 319 raid6_avx5122_xor_syndrome, 320 raid6_have_avx512, 320 raid6_have_avx512, 321 "avx512x2", 321 "avx512x2", 322 .priority = 2 /* Prefer AVX5 !! 322 1 /* Has cache hints */ 323 }; 323 }; 324 324 325 #ifdef CONFIG_X86_64 325 #ifdef CONFIG_X86_64 326 326 327 /* 327 /* 328 * Unrolled-by-4 AVX2 implementation 328 * Unrolled-by-4 AVX2 implementation 329 */ 329 */ 330 static void raid6_avx5124_gen_syndrome(int dis 330 static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs) 331 { 331 { 332 u8 **dptr = (u8 **)ptrs; 332 u8 **dptr = (u8 **)ptrs; 333 u8 *p, *q; 333 u8 *p, *q; 334 int d, z, z0; 334 int d, z, z0; 335 335 336 z0 = disks - 3; /* Highest dat 336 z0 = disks - 3; /* Highest data disk */ 337 p = dptr[z0+1]; /* XOR parity 337 p = dptr[z0+1]; /* XOR parity */ 338 q = dptr[z0+2]; /* RS syndrome 338 q = dptr[z0+2]; /* RS syndrome */ 339 339 340 kernel_fpu_begin(); 340 kernel_fpu_begin(); 341 341 342 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 342 asm volatile("vmovdqa64 %0,%%zmm0\n\t" 343 "vpxorq %%zmm1,%%zmm1,%%z 343 "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */ 344 "vpxorq %%zmm2,%%zmm2,%%z 344 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */ 345 "vpxorq %%zmm3,%%zmm3,%%z 345 "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */ 346 "vpxorq %%zmm4,%%zmm4,%%z 346 "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */ 347 "vpxorq %%zmm6,%%zmm6,%%z 347 "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */ 348 "vpxorq %%zmm10,%%zmm10,% 348 "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */ 349 "vpxorq %%zmm11,%%zmm11,% 349 "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */ 350 "vpxorq %%zmm12,%%zmm12,% 350 "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */ 351 "vpxorq %%zmm14,%%zmm14,% 351 "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */ 352 : 352 : 353 : "m" (raid6_avx512_const 353 : "m" (raid6_avx512_constants.x1d[0])); 354 354 355 for (d = 0; d < bytes; d += 256) { 355 for (d = 0; d < bytes; d += 256) { 356 for (z = z0; z >= 0; z--) { 356 for (z = z0; z >= 0; z--) { 357 asm volatile("prefetchnta %0\n 357 asm volatile("prefetchnta %0\n\t" 358 "prefetchnta %1\n 358 "prefetchnta %1\n\t" 359 "prefetchnta %2\n 359 "prefetchnta %2\n\t" 360 "prefetchnta %3\n 360 "prefetchnta %3\n\t" 361 "vpcmpgtb %%zmm4, 361 "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" 362 "vpcmpgtb %%zmm6, 362 "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" 363 "vpcmpgtb %%zmm12 363 "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t" 364 "vpcmpgtb %%zmm14 364 "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t" 365 "vpmovm2b %%k1,%% 365 "vpmovm2b %%k1,%%zmm5\n\t" 366 "vpmovm2b %%k2,%% 366 "vpmovm2b %%k2,%%zmm7\n\t" 367 "vpmovm2b %%k3,%% 367 "vpmovm2b %%k3,%%zmm13\n\t" 368 "vpmovm2b %%k4,%% 368 "vpmovm2b %%k4,%%zmm15\n\t" 369 "vpaddb %%zmm4,%% 369 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 370 "vpaddb %%zmm6,%% 370 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 371 "vpaddb %%zmm12,% 371 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" 372 "vpaddb %%zmm14,% 372 "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" 373 "vpandq %%zmm0,%% 373 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 374 "vpandq %%zmm0,%% 374 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 375 "vpandq %%zmm0,%% 375 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" 376 "vpandq %%zmm0,%% 376 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" 377 "vpxorq %%zmm5,%% 377 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 378 "vpxorq %%zmm7,%% 378 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 379 "vpxorq %%zmm13,% 379 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 380 "vpxorq %%zmm15,% 380 "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" 381 "vmovdqa64 %0,%%z 381 "vmovdqa64 %0,%%zmm5\n\t" 382 "vmovdqa64 %1,%%z 382 "vmovdqa64 %1,%%zmm7\n\t" 383 "vmovdqa64 %2,%%z 383 "vmovdqa64 %2,%%zmm13\n\t" 384 "vmovdqa64 %3,%%z 384 "vmovdqa64 %3,%%zmm15\n\t" 385 "vpxorq %%zmm5,%% 385 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 386 "vpxorq %%zmm7,%% 386 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 387 "vpxorq %%zmm13,% 387 "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" 388 "vpxorq %%zmm15,% 388 "vpxorq %%zmm15,%%zmm11,%%zmm11\n" 389 "vpxorq %%zmm5,%% 389 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 390 "vpxorq %%zmm7,%% 390 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 391 "vpxorq %%zmm13,% 391 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 392 "vpxorq %%zmm15,% 392 "vpxorq %%zmm15,%%zmm14,%%zmm14" 393 : 393 : 394 : "m" (dptr[z][d] 394 : "m" (dptr[z][d]), "m" (dptr[z][d+64]), 395 "m" (dptr[z][d+ 395 "m" (dptr[z][d+128]), "m" (dptr[z][d+192])); 396 } 396 } 397 asm volatile("vmovntdq %%zmm2, 397 asm volatile("vmovntdq %%zmm2,%0\n\t" 398 "vpxorq %%zmm2,%% 398 "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" 399 "vmovntdq %%zmm3, 399 "vmovntdq %%zmm3,%1\n\t" 400 "vpxorq %%zmm3,%% 400 "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" 401 "vmovntdq %%zmm10 401 "vmovntdq %%zmm10,%2\n\t" 402 "vpxorq %%zmm10,% 402 "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" 403 "vmovntdq %%zmm11 403 "vmovntdq %%zmm11,%3\n\t" 404 "vpxorq %%zmm11,% 404 "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" 405 "vmovntdq %%zmm4, 405 "vmovntdq %%zmm4,%4\n\t" 406 "vpxorq %%zmm4,%% 406 "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" 407 "vmovntdq %%zmm6, 407 "vmovntdq %%zmm6,%5\n\t" 408 "vpxorq %%zmm6,%% 408 "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" 409 "vmovntdq %%zmm12 409 "vmovntdq %%zmm12,%6\n\t" 410 "vpxorq %%zmm12,% 410 "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" 411 "vmovntdq %%zmm14 411 "vmovntdq %%zmm14,%7\n\t" 412 "vpxorq %%zmm14,% 412 "vpxorq %%zmm14,%%zmm14,%%zmm14" 413 : 413 : 414 : "m" (p[d]), "m" 414 : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), 415 "m" (p[d+192]), 415 "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), 416 "m" (q[d+128]), 416 "m" (q[d+128]), "m" (q[d+192])); 417 } 417 } 418 418 419 asm volatile("sfence" : : : "memory"); 419 asm volatile("sfence" : : : "memory"); 420 kernel_fpu_end(); 420 kernel_fpu_end(); 421 } 421 } 422 422 423 static void raid6_avx5124_xor_syndrome(int dis 423 static void raid6_avx5124_xor_syndrome(int disks, int start, int stop, 424 size_t 424 size_t bytes, void **ptrs) 425 { 425 { 426 u8 **dptr = (u8 **)ptrs; 426 u8 **dptr = (u8 **)ptrs; 427 u8 *p, *q; 427 u8 *p, *q; 428 int d, z, z0; 428 int d, z, z0; 429 429 430 z0 = stop; /* P/Q right s 430 z0 = stop; /* P/Q right side optimization */ 431 p = dptr[disks-2]; /* XOR parity 431 p = dptr[disks-2]; /* XOR parity */ 432 q = dptr[disks-1]; /* RS syndrome 432 q = dptr[disks-1]; /* RS syndrome */ 433 433 434 kernel_fpu_begin(); 434 kernel_fpu_begin(); 435 435 436 asm volatile("vmovdqa64 %0,%%zmm0" 436 asm volatile("vmovdqa64 %0,%%zmm0" 437 :: "m" (raid6_avx512_cons 437 :: "m" (raid6_avx512_constants.x1d[0])); 438 438 439 for (d = 0 ; d < bytes ; d += 256) { 439 for (d = 0 ; d < bytes ; d += 256) { 440 asm volatile("vmovdqa64 %0,%%z 440 asm volatile("vmovdqa64 %0,%%zmm4\n\t" 441 "vmovdqa64 %1,%%z 441 "vmovdqa64 %1,%%zmm6\n\t" 442 "vmovdqa64 %2,%%z 442 "vmovdqa64 %2,%%zmm12\n\t" 443 "vmovdqa64 %3,%%z 443 "vmovdqa64 %3,%%zmm14\n\t" 444 "vmovdqa64 %4,%%z 444 "vmovdqa64 %4,%%zmm2\n\t" 445 "vmovdqa64 %5,%%z 445 "vmovdqa64 %5,%%zmm3\n\t" 446 "vmovdqa64 %6,%%z 446 "vmovdqa64 %6,%%zmm10\n\t" 447 "vmovdqa64 %7,%%z 447 "vmovdqa64 %7,%%zmm11\n\t" 448 "vpxorq %%zmm4,%% 448 "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" 449 "vpxorq %%zmm6,%% 449 "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t" 450 "vpxorq %%zmm12,% 450 "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t" 451 "vpxorq %%zmm14,% 451 "vpxorq %%zmm14,%%zmm11,%%zmm11" 452 : 452 : 453 : "m" (dptr[z0][d 453 : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), 454 "m" (dptr[z0][d 454 "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]), 455 "m" (p[d]), "m" 455 "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), 456 "m" (p[d+192])) 456 "m" (p[d+192])); 457 /* P/Q data pages */ 457 /* P/Q data pages */ 458 for (z = z0-1 ; z >= start ; z 458 for (z = z0-1 ; z >= start ; z--) { 459 asm volatile("vpxorq % 459 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 460 "vpxorq % 460 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 461 "vpxorq % 461 "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" 462 "vpxorq % 462 "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" 463 "prefetch 463 "prefetchnta %0\n\t" 464 "prefetch 464 "prefetchnta %2\n\t" 465 "vpcmpgtb 465 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 466 "vpcmpgtb 466 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 467 "vpcmpgtb 467 "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" 468 "vpcmpgtb 468 "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" 469 "vpmovm2b 469 "vpmovm2b %%k1,%%zmm5\n\t" 470 "vpmovm2b 470 "vpmovm2b %%k2,%%zmm7\n\t" 471 "vpmovm2b 471 "vpmovm2b %%k3,%%zmm13\n\t" 472 "vpmovm2b 472 "vpmovm2b %%k4,%%zmm15\n\t" 473 "vpaddb % 473 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 474 "vpaddb % 474 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 475 "vpaddb % 475 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" 476 "vpaddb % 476 "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t" 477 "vpandq % 477 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 478 "vpandq % 478 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 479 "vpandq % 479 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" 480 "vpandq % 480 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" 481 "vpxorq % 481 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 482 "vpxorq % 482 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 483 "vpxorq % 483 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 484 "vpxorq % 484 "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" 485 "vmovdqa6 485 "vmovdqa64 %0,%%zmm5\n\t" 486 "vmovdqa6 486 "vmovdqa64 %1,%%zmm7\n\t" 487 "vmovdqa6 487 "vmovdqa64 %2,%%zmm13\n\t" 488 "vmovdqa6 488 "vmovdqa64 %3,%%zmm15\n\t" 489 "vpxorq % 489 "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" 490 "vpxorq % 490 "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" 491 "vpxorq % 491 "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" 492 "vpxorq % 492 "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t" 493 "vpxorq % 493 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 494 "vpxorq % 494 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 495 "vpxorq % 495 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 496 "vpxorq % 496 "vpxorq %%zmm15,%%zmm14,%%zmm14" 497 : 497 : 498 : "m" (dp 498 : "m" (dptr[z][d]), "m" (dptr[z][d+64]), 499 "m" (dp 499 "m" (dptr[z][d+128]), 500 "m" (dp 500 "m" (dptr[z][d+192])); 501 } 501 } 502 asm volatile("prefetchnta %0\n 502 asm volatile("prefetchnta %0\n\t" 503 "prefetchnta %1\n 503 "prefetchnta %1\n\t" 504 : 504 : 505 : "m" (q[d]), "m" 505 : "m" (q[d]), "m" (q[d+128])); 506 /* P/Q left side optimization 506 /* P/Q left side optimization */ 507 for (z = start-1 ; z >= 0 ; z- 507 for (z = start-1 ; z >= 0 ; z--) { 508 asm volatile("vpxorq % 508 asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" 509 "vpxorq % 509 "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" 510 "vpxorq % 510 "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" 511 "vpxorq % 511 "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" 512 "vpcmpgtb 512 "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" 513 "vpcmpgtb 513 "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" 514 "vpcmpgtb 514 "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" 515 "vpcmpgtb 515 "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" 516 "vpmovm2b 516 "vpmovm2b %%k1,%%zmm5\n\t" 517 "vpmovm2b 517 "vpmovm2b %%k2,%%zmm7\n\t" 518 "vpmovm2b 518 "vpmovm2b %%k3,%%zmm13\n\t" 519 "vpmovm2b 519 "vpmovm2b %%k4,%%zmm15\n\t" 520 "vpaddb % 520 "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" 521 "vpaddb % 521 "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" 522 "vpaddb % 522 "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" 523 "vpaddb % 523 "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" 524 "vpandq % 524 "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" 525 "vpandq % 525 "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" 526 "vpandq % 526 "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" 527 "vpandq % 527 "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" 528 "vpxorq % 528 "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" 529 "vpxorq % 529 "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" 530 "vpxorq % 530 "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" 531 "vpxorq % 531 "vpxorq %%zmm15,%%zmm14,%%zmm14" 532 : 532 : 533 : ); 533 : ); 534 } 534 } 535 asm volatile("vmovntdq %%zmm2, 535 asm volatile("vmovntdq %%zmm2,%0\n\t" 536 "vmovntdq %%zmm3, 536 "vmovntdq %%zmm3,%1\n\t" 537 "vmovntdq %%zmm10 537 "vmovntdq %%zmm10,%2\n\t" 538 "vmovntdq %%zmm11 538 "vmovntdq %%zmm11,%3\n\t" 539 "vpxorq %4,%%zmm4 539 "vpxorq %4,%%zmm4,%%zmm4\n\t" 540 "vpxorq %5,%%zmm6 540 "vpxorq %5,%%zmm6,%%zmm6\n\t" 541 "vpxorq %6,%%zmm1 541 "vpxorq %6,%%zmm12,%%zmm12\n\t" 542 "vpxorq %7,%%zmm1 542 "vpxorq %7,%%zmm14,%%zmm14\n\t" 543 "vmovntdq %%zmm4, 543 "vmovntdq %%zmm4,%4\n\t" 544 "vmovntdq %%zmm6, 544 "vmovntdq %%zmm6,%5\n\t" 545 "vmovntdq %%zmm12 545 "vmovntdq %%zmm12,%6\n\t" 546 "vmovntdq %%zmm14 546 "vmovntdq %%zmm14,%7" 547 : 547 : 548 : "m" (p[d]), "m 548 : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), 549 "m" (p[d+192]), 549 "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), 550 "m" (q[d+128]), 550 "m" (q[d+128]), "m" (q[d+192])); 551 } 551 } 552 asm volatile("sfence" : : : "memory"); 552 asm volatile("sfence" : : : "memory"); 553 kernel_fpu_end(); 553 kernel_fpu_end(); 554 } 554 } 555 const struct raid6_calls raid6_avx512x4 = { 555 const struct raid6_calls raid6_avx512x4 = { 556 raid6_avx5124_gen_syndrome, 556 raid6_avx5124_gen_syndrome, 557 raid6_avx5124_xor_syndrome, 557 raid6_avx5124_xor_syndrome, 558 raid6_have_avx512, 558 raid6_have_avx512, 559 "avx512x4", 559 "avx512x4", 560 .priority = 2 /* Prefer AVX5 !! 560 1 /* Has cache hints */ 561 }; 561 }; 562 #endif 562 #endif 563 563 564 #endif /* CONFIG_AS_AVX512 */ 564 #endif /* CONFIG_AS_AVX512 */ 565 565
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.