// SPDX-License-Identifier: GPL-2.0
/*
 * NUMA emulation
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
#include <linux/numa_memblks.h>
#include <asm/numa.h>

#define FAKE_NODE_MIN_SIZE	((u64)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))

static int emu_nid_to_phys[MAX_NUMNODES];
static char *emu_cmdline __initdata;

int __init numa_emu_cmdline(char *str)
{
	emu_cmdline = str;
	return 0;
}

static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].nid == nid)
			return i;
	return -ENOENT;
}

static u64 __init mem_hole_size(u64 start, u64 end)
{
	unsigned long start_pfn = PFN_UP(start);
	unsigned long end_pfn = PFN_DOWN(end);

	if (start_pfn < end_pfn)
		return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
	return 0;
}

/*
 * Sets up nid to range from @start to @end.  The return value is -errno if
 * something went wrong, 0 otherwise.
 */
static int __init emu_setup_memblk(struct numa_meminfo *ei,
				   struct numa_meminfo *pi,
				   int nid, int phys_blk, u64 size)
{
	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
	struct numa_memblk *pb = &pi->blk[phys_blk];

	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
		return -EINVAL;
	}

	ei->nr_blks++;
	eb->start = pb->start;
	eb->end = pb->start + size;
	eb->nid = nid;

	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
		emu_nid_to_phys[nid] = pb->nid;

	pb->start += size;
	if (pb->start >= pb->end) {
		WARN_ON_ONCE(pb->start > pb->end);
		numa_remove_memblk_from(phys_blk, pi);
	}

	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
	return 0;
}

/*
 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
 * to max_addr.
 *
 * Returns zero on success or negative on error.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
					 struct numa_meminfo *pi,
					 u64 addr, u64 max_addr, int nr_nodes)
{
	nodemask_t physnode_mask = numa_nodes_parsed;
	u64 size;
	int big;
	int nid = 0;
	int i, ret;

	if (nr_nodes <= 0)
		return -1;
	if (nr_nodes > MAX_NUMNODES) {
		pr_info("numa=fake=%d too large, reducing to %d\n",
			nr_nodes, MAX_NUMNODES);
		nr_nodes = MAX_NUMNODES;
	}

	/*
	 * Calculate target node size.  x86_32 freaks on __udivdi3() so do
	 * the division in ulong number of pages and convert back.
	 */
	size = max_addr - addr - mem_hole_size(addr, max_addr);
	size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);

	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the remainder.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
		FAKE_NODE_MIN_SIZE;

	size &= FAKE_NODE_MIN_HASH_MASK;
	if (!size) {
		pr_err("Not enough memory for each node.  "
			"NUMA emulation disabled.\n");
		return -1;
	}

	/*
	 * Continue to fill physical nodes with fake nodes until there is no
	 * memory left on any of them.
	 */
	while (!nodes_empty(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = numa_emu_dma_end();
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;
			end = start + size;

			if (nid < big)
				end += FAKE_NODE_MIN_SIZE;

			/*
			 * Continue to add memory to this fake node if its
			 * non-reserved memory is less than the per-node size.
			 */
			while (end - start - mem_hole_size(start, end) < size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > limit) {
					end = limit;
					break;
				}
			}

			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (limit - end - mem_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}

/*
 * Returns the end address of a node so that there is at least `size' amount of
 * non-reserved memory or `max_addr' is reached.
 */
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
	u64 end = start + size;

	while (end - start - mem_hole_size(start, end) < size) {
		end += FAKE_NODE_MIN_SIZE;
		if (end > max_addr) {
			end = max_addr;
			break;
		}
	}
	return end;
}

static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
{
	unsigned long max_pfn = PHYS_PFN(max_addr);
	unsigned long base_pfn = PHYS_PFN(base);
	unsigned long hole_pfns = PHYS_PFN(hole);

	return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
}

/*
 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
 * `addr' to `max_addr'.
 *
 * Returns zero on success or negative on error.
 */
static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
					      struct numa_meminfo *pi,
					      u64 addr, u64 max_addr, u64 size,
					      int nr_nodes, struct numa_memblk *pblk,
					      int nid)
{
	nodemask_t physnode_mask = numa_nodes_parsed;
	int i, ret, uniform = 0;
	u64 min_size;

	if ((!size && !nr_nodes) || (nr_nodes && !pblk))
		return -1;

	/*
	 * In the 'uniform' case split the passed in physical node by
	 * nr_nodes, in the non-uniform case, ignore the passed in
	 * physical block and try to create nodes of at least size
	 * @size.
	 *
	 * In the uniform case, split the nodes strictly by physical
	 * capacity, i.e. ignore holes. In the non-uniform case account
	 * for holes and treat @size as a minimum floor.
	 */
	if (!nr_nodes)
		nr_nodes = MAX_NUMNODES;
	else {
		nodes_clear(physnode_mask);
		node_set(pblk->nid, physnode_mask);
		uniform = 1;
	}

	if (uniform) {
		min_size = uniform_size(max_addr, addr, 0, nr_nodes);
		size = min_size;
	} else {
		/*
		 * The limit on emulated nodes is MAX_NUMNODES, so the
		 * size per node is increased accordingly if the
		 * requested size is too small.  This creates a uniform
		 * distribution of node sizes across the entire machine
		 * (but not necessarily over physical nodes).
		 */
		min_size = uniform_size(max_addr, addr,
				mem_hole_size(addr, max_addr), nr_nodes);
	}
	min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
	if (size < min_size) {
		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
			size >> 20, min_size >> 20);
		size = min_size;
	}
	size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);

	/*
	 * Fill physical nodes with fake nodes of size until there is no memory
	 * left on any of them.
	 */
	while (!nodes_empty(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = numa_emu_dma_end();
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}

			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;

			if (uniform)
				end = start + size;
			else
				end = find_end_of_node(start, limit, size);
			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if ((limit - end - mem_hole_size(end, limit) < size)
					&& !uniform)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return nid;
}

static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
					      struct numa_meminfo *pi,
					      u64 addr, u64 max_addr, u64 size)
{
	return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
			0, NULL, 0);
}

static int __init setup_emu2phys_nid(int *dfl_phys_nid)
{
	int i, max_emu_nid = 0;

	*dfl_phys_nid = NUMA_NO_NODE;
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
			max_emu_nid = i;
			if (*dfl_phys_nid == NUMA_NO_NODE)
				*dfl_phys_nid = emu_nid_to_phys[i];
		}
	}

	return max_emu_nid;
}

/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 * @numa_meminfo contains the physical memory configuration and is modified
 * to reflect the emulated configuration on success.  @numa_dist_cnt is
 * used to determine the size of the physical distance table.
 *
 * On success, the following modifications are made.
 *
 * - @numa_meminfo is updated to reflect the emulated nodes.
 *
 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
 *   emulated nodes.
 *
 * - NUMA distance table is rebuilt to represent distances between emulated
 *   nodes.  The distances are determined considering how emulated nodes
 *   are mapped to physical nodes and match the actual distances.
 *
 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
 *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
 *
 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 * identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
	static struct numa_meminfo ei __initdata;
	static struct numa_meminfo pi __initdata;
	const u64 max_addr = PFN_PHYS(max_pfn);
	u8 *phys_dist = NULL;
	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
	int max_emu_nid, dfl_phys_nid;
	int i, j, ret;

	if (!emu_cmdline)
		goto no_emu;

	memset(&ei, 0, sizeof(ei));
	pi = *numa_meminfo;

	for (i = 0; i < MAX_NUMNODES; i++)
		emu_nid_to_phys[i] = NUMA_NO_NODE;

	/*
	 * If the numa=fake command-line contains a 'M' or 'G', it represents
	 * the fixed node size.  Otherwise, if it is just a single number N,
	 * split the system RAM into N fake nodes.
	 */
	if (strchr(emu_cmdline, 'U')) {
		nodemask_t physnode_mask = numa_nodes_parsed;
		unsigned long n;
		int nid = 0;

		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
		ret = -1;
		for_each_node_mask(i, physnode_mask) {
			/*
			 * The reason we pass in blk[0] is due to
			 * numa_remove_memblk_from() called by
			 * emu_setup_memblk() will delete entry 0
			 * and then move everything else up in the pi.blk
			 * array. Therefore we should always be looking
			 * at blk[0].
			 */
			ret = split_nodes_size_interleave_uniform(&ei, &pi,
					pi.blk[0].start, pi.blk[0].end, 0,
					n, &pi.blk[0], nid);
			if (ret < 0)
				break;
			if (ret < n) {
				pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
						__func__, i, ret, n);
				ret = -1;
				break;
			}
			nid = ret;
		}
	} else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
		u64 size;

		size = memparse(emu_cmdline, &emu_cmdline);
		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
	} else {
		unsigned long n;

		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
	}
	if (*emu_cmdline == ':')
		emu_cmdline++;

	if (ret < 0)
		goto no_emu;

	if (numa_cleanup_meminfo(&ei) < 0) {
		pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
		goto no_emu;
	}

	/* copy the physical distance table */
	if (numa_dist_cnt) {
		phys_dist = memblock_alloc(phys_size, PAGE_SIZE);
		if (!phys_dist) {
			pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
			goto no_emu;
		}

		for (i = 0; i < numa_dist_cnt; i++)
			for (j = 0; j < numa_dist_cnt; j++)
				phys_dist[i * numa_dist_cnt + j] =
					node_distance(i, j);
	}

	/*
	 * Determine the max emulated nid and the default phys nid to use
	 * for unmapped nodes.
	 */
	max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);

	/* commit */
	*numa_meminfo = ei;

	/* Make sure numa_nodes_parsed only contains emulated nodes */
	nodes_clear(numa_nodes_parsed);
	for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
		if (ei.blk[i].start != ei.blk[i].end &&
		    ei.blk[i].nid != NUMA_NO_NODE)
			node_set(ei.blk[i].nid, numa_nodes_parsed);

	numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys));

	/* make sure all emulated nodes are mapped to a physical node */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
			emu_nid_to_phys[i] = dfl_phys_nid;

	/* transform distance table */
	numa_reset_distance();
	for (i = 0; i < max_emu_nid + 1; i++) {
		for (j = 0; j < max_emu_nid + 1; j++) {
			int physi = emu_nid_to_phys[i];
			int physj = emu_nid_to_phys[j];
			int dist;

			if (get_option(&emu_cmdline, &dist) == 2)
				;
			else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
				dist = physi == physj ?
					LOCAL_DISTANCE : REMOTE_DISTANCE;
			else
				dist = phys_dist[physi * numa_dist_cnt + physj];

			numa_set_distance(i, j, dist);
		}
	}

	/* free the copied physical distance table */
	memblock_free(phys_dist, phys_size);
	return;

no_emu:
	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		emu_nid_to_phys[i] = i;
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void numa_add_cpu(unsigned int cpu)
{
	int physnid, nid;

	nid = early_cpu_to_node(cpu);
	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

	physnid = emu_nid_to_phys[nid];

	/*
	 * Map the cpu to each emulated node that is allocated on the physical
	 * node of the cpu's apic id.
	 */
	for_each_online_node(nid)
		if (emu_nid_to_phys[nid] == physnid)
			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}

void numa_remove_cpu(unsigned int cpu)
{
	int i;

	for_each_online_node(i)
		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
static void numa_set_cpumask(unsigned int cpu, bool enable)
{
	int nid, physnid;

	nid = early_cpu_to_node(cpu);
	if (nid == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}

	physnid = emu_nid_to_phys[nid];

	for_each_online_node(nid) {
		if (emu_nid_to_phys[nid] != physnid)
			continue;

		debug_cpumask_set_cpu(cpu, nid, enable);
	}
}

void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
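
/*
 * Illustrative numa=fake command lines, sketched from the parsing done in
 * numa_emulation() above (a plain node count, a 'U' suffix for a
 * per-physical-node split, or an 'M'/'G' suffixed node size handled via
 * memparse()).  These examples are not part of the parser itself; see
 * Documentation/admin-guide/kernel-parameters.txt for the authoritative
 * syntax.
 *
 *   numa=fake=8      interleave 8 emulated nodes over all physical nodes
 *   numa=fake=4U     split each parsed physical node into 4 emulated nodes
 *   numa=fake=512M   build emulated nodes of 512MB each ('G' also accepted)
 */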