// SPDX-License-Identifier: GPL-2.0
/*
 * NUMA emulation
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
#include <linux/numa_memblks.h>
#include <asm/numa.h>

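/*
 * Emulated nodes are carved out in multiples of FAKE_NODE_MIN_SIZE (32 MiB);
 * FAKE_NODE_MIN_HASH_MASK rounds a size down to that granularity.
 * Illustrative example: 0x6300000 (99 MiB) & FAKE_NODE_MIN_HASH_MASK
 * == 0x6000000 (96 MiB).
 */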
#define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))

static int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;

int __init numa_emu_cmdline(char *str)
{
	emu_cmdline = str;
	return 0;
}

static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].nid == nid)
			return i;
	return -ENOENT;
}

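/* Size, in bytes, of the non-present (absent) memory in [start, end). */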
static u64 __init mem_hole_size(u64 start, u64 end)
{
	unsigned long start_pfn = PFN_UP(start);
	unsigned long end_pfn = PFN_DOWN(end);

	if (start_pfn < end_pfn)
		return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
	return 0;
}

/*
 * Sets up nid to range from @start to @end.  The return value is -errno if
 * something went wrong, 0 otherwise.
 */
static int __init emu_setup_memblk(struct numa_meminfo *ei,
				   struct numa_meminfo *pi,
				   int nid, int phys_blk, u64 size)
{
	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
	struct numa_memblk *pb = &pi->blk[phys_blk];

	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
		return -EINVAL;
	}

	ei->nr_blks++;
	eb->start = pb->start;
	eb->end = pb->start + size;
	eb->nid = nid;

	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
		emu_nid_to_phys[nid] = pb->nid;

	pb->start += size;
	if (pb->start >= pb->end) {
		WARN_ON_ONCE(pb->start > pb->end);
		numa_remove_memblk_from(phys_blk, pi);
	}

	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
	return 0;
}

/*
 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
 * to max_addr.
 *
 * Returns zero on success or negative on error.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
					 struct numa_meminfo *pi,
					 u64 addr, u64 max_addr, int nr_nodes)
{
	nodemask_t physnode_mask = numa_nodes_parsed;
	u64 size;
	int big;
	int nid = 0;
	int i, ret;

	if (nr_nodes <= 0)
		return -1;
	if (nr_nodes > MAX_NUMNODES) {
		pr_info("numa=fake=%d too large, reducing to %d\n",
			nr_nodes, MAX_NUMNODES);
		nr_nodes = MAX_NUMNODES;
	}

	/*
	 * Calculate target node size.  x86_32 freaks on __udivdi3() so do
	 * the division in ulong number of pages and convert back.
	 */
	size = max_addr - addr - mem_hole_size(addr, max_addr);
	size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);

	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the remainder.
	 */
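	/*
	 * Rough illustration (ignoring page rounding): 4 GiB of usable
	 * memory over nr_nodes == 3 gives size ~= 1365 MiB; the ~21 MiB
	 * above the 32 MiB multiple, times three nodes, funds big = 1
	 * extra-large node, and size is then masked down to 1344 MiB.
	 */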
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
		FAKE_NODE_MIN_SIZE;

	size &= FAKE_NODE_MIN_HASH_MASK;
	if (!size) {
		pr_err("Not enough memory for each node.  "
			"NUMA emulation disabled.\n");
		return -1;
	}

	/*
	 * Continue to fill physical nodes with fake nodes until there is no
	 * memory left on any of them.
	 */
	while (!nodes_empty(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = numa_emu_dma_end();
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;
			end = start + size;

			if (nid < big)
				end += FAKE_NODE_MIN_SIZE;

			/*
			 * Continue to add memory to this fake node if its
			 * non-reserved memory is less than the per-node size.
			 */
			while (end - start - mem_hole_size(start, end) < size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > limit) {
					end = limit;
					break;
				}
			}

			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (limit - end - mem_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}

/*
 * Returns the end address of a node so that there is at least `size' amount of
 * non-reserved memory or `max_addr' is reached.
 */
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
	u64 end = start + size;

	while (end - start - mem_hole_size(start, end) < size) {
		end += FAKE_NODE_MIN_SIZE;
		if (end > max_addr) {
			end = max_addr;
			break;
		}
	}
	return end;
}

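/*
 * Even per-node share of [base, max_addr) minus @hole.  The division is
 * done in ulong pages, presumably for the same reason noted in
 * split_nodes_interleave(): x86_32 would otherwise need __udivdi3().
 */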
static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
{
	unsigned long max_pfn = PHYS_PFN(max_addr);
	unsigned long base_pfn = PHYS_PFN(base);
	unsigned long hole_pfns = PHYS_PFN(hole);

	return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
}

/*
 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
 * `addr' to `max_addr'.
 *
 * Returns zero on success or negative on error.
 */
static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
					struct numa_meminfo *pi,
					u64 addr, u64 max_addr, u64 size,
					int nr_nodes, struct numa_memblk *pblk,
					int nid)
{
	nodemask_t physnode_mask = numa_nodes_parsed;
	int i, ret, uniform = 0;
	u64 min_size;

	if ((!size && !nr_nodes) || (nr_nodes && !pblk))
		return -1;

	/*
	 * In the 'uniform' case split the passed in physical node by
	 * nr_nodes, in the non-uniform case, ignore the passed in
	 * physical block and try to create nodes of at least size
	 * @size.
	 *
	 * In the uniform case, split the nodes strictly by physical
	 * capacity, i.e. ignore holes. In the non-uniform case account
	 * for holes and treat @size as a minimum floor.
	 */
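	/*
	 * For example (illustrative): numa=fake=8U lands here with
	 * nr_nodes == 8 and @pblk set, splitting each physical node into
	 * eight equal-capacity pieces, while numa=fake=512M arrives via
	 * split_nodes_size_interleave() with nr_nodes == 0 and @size as
	 * a floor of non-reserved memory per node.
	 */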
	if (!nr_nodes)
		nr_nodes = MAX_NUMNODES;
	else {
		nodes_clear(physnode_mask);
		node_set(pblk->nid, physnode_mask);
		uniform = 1;
	}

	if (uniform) {
		min_size = uniform_size(max_addr, addr, 0, nr_nodes);
		size = min_size;
	} else {
		/*
		 * The limit on emulated nodes is MAX_NUMNODES, so the
		 * size per node is increased accordingly if the
		 * requested size is too small.  This creates a uniform
		 * distribution of node sizes across the entire machine
		 * (but not necessarily over physical nodes).
		 */
		min_size = uniform_size(max_addr, addr,
				mem_hole_size(addr, max_addr), nr_nodes);
	}
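	/*
	 * Enforce the 32 MiB floor: e.g. (illustrative) numa=fake=16M is
	 * raised to at least FAKE_NODE_MIN_SIZE here before @size is
	 * aligned down to the same granularity.
	 */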
	min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
	if (size < min_size) {
		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
			size >> 20, min_size >> 20);
		size = min_size;
	}
	size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);

	/*
	 * Fill physical nodes with fake nodes of size until there is no memory
	 * left on any of them.
	 */
	while (!nodes_empty(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = numa_emu_dma_end();
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}

			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;

			if (uniform)
				end = start + size;
			else
				end = find_end_of_node(start, limit, size);
			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if ((limit - end - mem_hole_size(end, limit) < size)
					&& !uniform)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return nid;
}

static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
					      struct numa_meminfo *pi,
					      u64 addr, u64 max_addr, u64 size)
{
	return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
			0, NULL, 0);
}

static int __init setup_emu2phys_nid(int *dfl_phys_nid)
{
	int i, max_emu_nid = 0;

	*dfl_phys_nid = NUMA_NO_NODE;
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
			max_emu_nid = i;
			if (*dfl_phys_nid == NUMA_NO_NODE)
				*dfl_phys_nid = emu_nid_to_phys[i];
		}
	}

	return max_emu_nid;
}

/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 * @numa_meminfo contains the physical memory configuration and is modified
 * to reflect the emulated configuration on success.  @numa_dist_cnt is
 * used to determine the size of the physical distance table.
 *
 * On success, the following modifications are made.
 *
 * - @numa_meminfo is updated to reflect the emulated nodes.
 *
 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
 *   emulated nodes.
 *
 * - NUMA distance table is rebuilt to represent distances between emulated
 *   nodes.  The distances are determined considering how emulated nodes
 *   are mapped to physical nodes and match the actual distances.
 *
 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
 *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
 *
 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 * identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
	static struct numa_meminfo ei __initdata;
	static struct numa_meminfo pi __initdata;
	const u64 max_addr = PFN_PHYS(max_pfn);
	u8 *phys_dist = NULL;
	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
	int max_emu_nid, dfl_phys_nid;
	int i, j, ret;

	if (!emu_cmdline)
		goto no_emu;

	memset(&ei, 0, sizeof(ei));
	pi = *numa_meminfo;

	for (i = 0; i < MAX_NUMNODES; i++)
		emu_nid_to_phys[i] = NUMA_NO_NODE;

	/*
	 * If the numa=fake command-line contains a 'M' or 'G', it represents
	 * the fixed node size.  Otherwise, if it is just a single number N,
	 * split the system RAM into N fake nodes.
	 */
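	/*
	 * Examples: numa=fake=8 creates eight interleaved nodes,
	 * numa=fake=2G creates nodes of 2 GiB each, and numa=fake=8U
	 * splits each physical node into eight emulated ones.  Anything
	 * left after a trailing ':' is consumed later by get_option()
	 * as an explicit distance list.
	 */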
	if (strchr(emu_cmdline, 'U')) {
		nodemask_t physnode_mask = numa_nodes_parsed;
		unsigned long n;
		int nid = 0;

		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
		ret = -1;
		for_each_node_mask(i, physnode_mask) {
			/*
			 * The reason we pass in blk[0] is due to
			 * numa_remove_memblk_from() called by
			 * emu_setup_memblk() will delete entry 0
			 * and then move everything else up in the pi.blk
			 * array. Therefore we should always be looking
			 * at blk[0].
			 */
			ret = split_nodes_size_interleave_uniform(&ei, &pi,
					pi.blk[0].start, pi.blk[0].end, 0,
					n, &pi.blk[0], nid);
			if (ret < 0)
				break;
			if (ret < n) {
				pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
						__func__, i, ret, n);
				ret = -1;
				break;
			}
			nid = ret;
		}
	} else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
		u64 size;

		size = memparse(emu_cmdline, &emu_cmdline);
		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
	} else {
		unsigned long n;

		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
	}
	if (*emu_cmdline == ':')
		emu_cmdline++;

	if (ret < 0)
		goto no_emu;

	if (numa_cleanup_meminfo(&ei) < 0) {
		pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
		goto no_emu;
	}

	/* copy the physical distance table */
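	/*
	 * The copy must be taken now; numa_reset_distance() below tears
	 * down the physical table before the emulated one is rebuilt.
	 */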
	if (numa_dist_cnt) {
		phys_dist = memblock_alloc(phys_size, PAGE_SIZE);
		if (!phys_dist) {
			pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
			goto no_emu;
		}

		for (i = 0; i < numa_dist_cnt; i++)
			for (j = 0; j < numa_dist_cnt; j++)
				phys_dist[i * numa_dist_cnt + j] =
					node_distance(i, j);
	}

	/*
	 * Determine the max emulated nid and the default phys nid to use
	 * for unmapped nodes.
	 */
	max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);

	/* commit */
	*numa_meminfo = ei;

	/* Make sure numa_nodes_parsed only contains emulated nodes */
	nodes_clear(numa_nodes_parsed);
	for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
		if (ei.blk[i].start != ei.blk[i].end &&
		    ei.blk[i].nid != NUMA_NO_NODE)
			node_set(ei.blk[i].nid, numa_nodes_parsed);

	numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys));

	/* make sure all emulated nodes are mapped to a physical node */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
			emu_nid_to_phys[i] = dfl_phys_nid;

	/* transform distance table */
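	/*
	 * Each emulated pair (i, j) inherits the distance between the
	 * physical nodes it maps to, so e.g. two emulated nodes carved
	 * from one physical node are LOCAL_DISTANCE apart.  A distance
	 * list left on the command line (after ':') takes precedence;
	 * physical nids beyond the table fall back to
	 * LOCAL_DISTANCE/REMOTE_DISTANCE.
	 */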
	numa_reset_distance();
	for (i = 0; i < max_emu_nid + 1; i++) {
		for (j = 0; j < max_emu_nid + 1; j++) {
			int physi = emu_nid_to_phys[i];
			int physj = emu_nid_to_phys[j];
			int dist;

			if (get_option(&emu_cmdline, &dist) == 2)
				;
			else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
				dist = physi == physj ?
					LOCAL_DISTANCE : REMOTE_DISTANCE;
			else
				dist = phys_dist[physi * numa_dist_cnt + physj];

			numa_set_distance(i, j, dist);
		}
	}

	/* free the copied physical distance table */
	memblock_free(phys_dist, phys_size);
	return;

no_emu:
	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		emu_nid_to_phys[i] = i;
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void numa_add_cpu(unsigned int cpu)
{
	int physnid, nid;

	nid = early_cpu_to_node(cpu);
	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

	physnid = emu_nid_to_phys[nid];

	/*
	 * Map the cpu to each emulated node that is allocated on the physical
	 * node of the cpu's apic id.
	 */
	for_each_online_node(nid)
		if (emu_nid_to_phys[nid] == physnid)
			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

void numa_remove_cpu(unsigned int cpu)
{
	int i;

	for_each_online_node(i)
		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
static void numa_set_cpumask(unsigned int cpu, bool enable)
{
	int nid, physnid;

	nid = early_cpu_to_node(cpu);
	if (nid == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}

	physnid = emu_nid_to_phys[nid];

	for_each_online_node(nid) {
		if (emu_nid_to_phys[nid] != physnid)
			continue;

		debug_cpumask_set_cpu(cpu, nid, enable);
	}
}

void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */