Linux/arch/x86/kernel/cpu/resctrl/monitor.c

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Resource Director Technology(RDT) 4 * - Monitoring code 5 * 6 * Copyright (C) 2017 Intel Corporation 7 * 8 * Author: 9 * Vikas Shivappa <vikas.shivappa@intel.com> 10 * 11 * This replaces the cqm.c based on perf but we reuse a lot of 12 * code and datastructures originally from Peter Zijlstra and Matt Fleming. 13 * 14 * More information about RDT be found in the Intel (R) x86 Architecture 15 * Software Developer Manual June 2016, volume 3, section 17.17. 16 */ 17 18 #define pr_fmt(fmt) "resctrl: " fmt 19 20 #include <linux/cpu.h> 21 #include <linux/module.h> 22 #include <linux/sizes.h> 23 #include <linux/slab.h> 24 25 #include <asm/cpu_device_id.h> 26 #include <asm/resctrl.h> 27 28 #include "internal.h" 29 #include "trace.h" 30 31 /** 32 * struct rmid_entry - dirty tracking for all RMID. 33 * @closid: The CLOSID for this entry. 34 * @rmid: The RMID for this entry. 35 * @busy: The number of domains with cached data using this RMID. 36 * @list: Member of the rmid_free_lru list when busy == 0. 37 * 38 * Depending on the architecture the correct monitor is accessed using 39 * both @closid and @rmid, or @rmid only. 40 * 41 * Take the rdtgroup_mutex when accessing. 42 */ 43 struct rmid_entry { 44 u32 closid; 45 u32 rmid; 46 int busy; 47 struct list_head list; 48 }; 49 50 /* 51 * @rmid_free_lru - A least recently used list of free RMIDs 52 * These RMIDs are guaranteed to have an occupancy less than the 53 * threshold occupancy 54 */ 55 static LIST_HEAD(rmid_free_lru); 56 57 /* 58 * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. 59 * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. 60 * Indexed by CLOSID. Protected by rdtgroup_mutex. 61 */ 62 static u32 *closid_num_dirty_rmid; 63 64 /* 65 * @rmid_limbo_count - count of currently unused but (potentially) 66 * dirty RMIDs. 67 * This counts RMIDs that no one is currently using but that 68 * may have a occupancy value > resctrl_rmid_realloc_threshold. User can 69 * change the threshold occupancy value. 70 */ 71 static unsigned int rmid_limbo_count; 72 73 /* 74 * @rmid_entry - The entry in the limbo and free lists. 75 */ 76 static struct rmid_entry *rmid_ptrs; 77 78 /* 79 * Global boolean for rdt_monitor which is true if any 80 * resource monitoring is enabled. 81 */ 82 bool rdt_mon_capable; 83 84 /* 85 * Global to indicate which monitoring events are enabled. 86 */ 87 unsigned int rdt_mon_features; 88 89 /* 90 * This is the threshold cache occupancy in bytes at which we will consider an 91 * RMID available for re-allocation. 92 */ 93 unsigned int resctrl_rmid_realloc_threshold; 94 95 /* 96 * This is the maximum value for the reallocation threshold, in bytes. 97 */ 98 unsigned int resctrl_rmid_realloc_limit; 99 100 #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) 101 102 static int snc_nodes_per_l3_cache = 1; 103 104 /* 105 * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. 106 * If rmid > rmid threshold, MBM total and local values should be multiplied 107 * by the correction factor. 108 * 109 * The original table is modified for better code: 110 * 111 * 1. The threshold 0 is changed to rmid count - 1 so don't do correction 112 * for the case. 113 * 2. MBM total and local correction table indexed by core counter which is 114 * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27. 115 * 3. The correction factor is normalized to 2^20 (1048576) so it's faster 116 * to calculate corrected value by shifting: 117 * corrected_value = (original_value * correction_factor) >> 20 118 */ 119 static const struct mbm_correction_factor_table { 120 u32 rmidthreshold; 121 u64 cf; 122 } mbm_cf_table[] __initconst = { 123 {7, CF(1.000000)}, 124 {15, CF(1.000000)}, 125 {15, CF(0.969650)}, 126 {31, CF(1.000000)}, 127 {31, CF(1.066667)}, 128 {31, CF(0.969650)}, 129 {47, CF(1.142857)}, 130 {63, CF(1.000000)}, 131 {63, CF(1.185115)}, 132 {63, CF(1.066553)}, 133 {79, CF(1.454545)}, 134 {95, CF(1.000000)}, 135 {95, CF(1.230769)}, 136 {95, CF(1.142857)}, 137 {95, CF(1.066667)}, 138 {127, CF(1.000000)}, 139 {127, CF(1.254863)}, 140 {127, CF(1.185255)}, 141 {151, CF(1.000000)}, 142 {127, CF(1.066667)}, 143 {167, CF(1.000000)}, 144 {159, CF(1.454334)}, 145 {183, CF(1.000000)}, 146 {127, CF(0.969744)}, 147 {191, CF(1.280246)}, 148 {191, CF(1.230921)}, 149 {215, CF(1.000000)}, 150 {191, CF(1.143118)}, 151 }; 152 153 static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; 154 static u64 mbm_cf __read_mostly; 155 156 static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) 157 { 158 /* Correct MBM value. */ 159 if (rmid > mbm_cf_rmidthreshold) 160 val = (val * mbm_cf) >> 20; 161 162 return val; 163 } 164 165 /* 166 * x86 and arm64 differ in their handling of monitoring. 167 * x86's RMID are independent numbers, there is only one source of traffic 168 * with an RMID value of '1'. 169 * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of 170 * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID 171 * value is no longer unique. 172 * To account for this, resctrl uses an index. On x86 this is just the RMID, 173 * on arm64 it encodes the CLOSID and RMID. This gives a unique number. 174 * 175 * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code 176 * must accept an attempt to read every index. 177 */ 178 static inline struct rmid_entry *__rmid_entry(u32 idx) 179 { 180 struct rmid_entry *entry; 181 u32 closid, rmid; 182 183 entry = &rmid_ptrs[idx]; 184 resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); 185 186 WARN_ON_ONCE(entry->closid != closid); 187 WARN_ON_ONCE(entry->rmid != rmid); 188 189 return entry; 190 } 191 192 /* 193 * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by 194 * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is 195 * needed. The physical RMID is the same as the logical RMID. 196 * 197 * On a platform with SNC mode enabled, Linux enables RMID sharing mode 198 * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel 199 * Resource Director Technology Architecture Specification" for a full 200 * description of RMID sharing mode). 201 * 202 * In RMID sharing mode there are fewer "logical RMID" values available 203 * to accumulate data ("physical RMIDs" are divided evenly between SNC 204 * nodes that share an L3 cache). Linux creates an rdt_mon_domain for 205 * each SNC node. 206 * 207 * The value loaded into IA32_PQR_ASSOC is the "logical RMID". 208 * 209 * Data is collected independently on each SNC node and can be retrieved 210 * using the "physical RMID" value computed by this function and loaded 211 * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node. 212 * 213 * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3 214 * cache. So a "physical RMID" may be read from any CPU that shares 215 * the L3 cache with the desired SNC node, not just from a CPU in 216 * the specific SNC node. 217 */ 218 static int logical_rmid_to_physical_rmid(int cpu, int lrmid) 219 { 220 struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; 221 222 if (snc_nodes_per_l3_cache == 1) 223 return lrmid; 224 225 return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; 226 } 227 228 static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) 229 { 230 u64 msr_val; 231 232 /* 233 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured 234 * with a valid event code for supported resource type and the bits 235 * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, 236 * IA32_QM_CTR.data (bits 61:0) reports the monitored data. 237 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) 238 * are error bits. 239 */ 240 wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid); 241 rdmsrl(MSR_IA32_QM_CTR, msr_val); 242 243 if (msr_val & RMID_VAL_ERROR) 244 return -EIO; 245 if (msr_val & RMID_VAL_UNAVAIL) 246 return -EINVAL; 247 248 *val = msr_val; 249 return 0; 250 } 251 252 static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom, 253 u32 rmid, 254 enum resctrl_event_id eventid) 255 { 256 switch (eventid) { 257 case QOS_L3_OCCUP_EVENT_ID: 258 return NULL; 259 case QOS_L3_MBM_TOTAL_EVENT_ID: 260 return &hw_dom->arch_mbm_total[rmid]; 261 case QOS_L3_MBM_LOCAL_EVENT_ID: 262 return &hw_dom->arch_mbm_local[rmid]; 263 } 264 265 /* Never expect to get here */ 266 WARN_ON_ONCE(1); 267 268 return NULL; 269 } 270 271 void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, 272 u32 unused, u32 rmid, 273 enum resctrl_event_id eventid) 274 { 275 struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); 276 int cpu = cpumask_any(&d->hdr.cpu_mask); 277 struct arch_mbm_state *am; 278 u32 prmid; 279 280 am = get_arch_mbm_state(hw_dom, rmid, eventid); 281 if (am) { 282 memset(am, 0, sizeof(*am)); 283 284 prmid = logical_rmid_to_physical_rmid(cpu, rmid); 285 /* Record any initial, non-zero count value. */ 286 __rmid_read_phys(prmid, eventid, &am->prev_msr); 287 } 288 } 289 290 /* 291 * Assumes that hardware counters are also reset and thus that there is 292 * no need to record initial non-zero counts. 293 */ 294 void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) 295 { 296 struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); 297 298 if (is_mbm_total_enabled()) 299 memset(hw_dom->arch_mbm_total, 0, 300 sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); 301 302 if (is_mbm_local_enabled()) 303 memset(hw_dom->arch_mbm_local, 0, 304 sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); 305 } 306 307 static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) 308 { 309 u64 shift = 64 - width, chunks; 310 311 chunks = (cur_msr << shift) - (prev_msr << shift); 312 return chunks >> shift; 313 } 314 315 int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, 316 u32 unused, u32 rmid, enum resctrl_event_id eventid, 317 u64 *val, void *ignored) 318 { 319 struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); 320 struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); 321 int cpu = cpumask_any(&d->hdr.cpu_mask); 322 struct arch_mbm_state *am; 323 u64 msr_val, chunks; 324 u32 prmid; 325 int ret; 326 327 resctrl_arch_rmid_read_context_check(); 328 329 prmid = logical_rmid_to_physical_rmid(cpu, rmid); 330 ret = __rmid_read_phys(prmid, eventid, &msr_val); 331 if (ret) 332 return ret; 333 334 am = get_arch_mbm_state(hw_dom, rmid, eventid); 335 if (am) { 336 am->chunks += mbm_overflow_count(am->prev_msr, msr_val, 337 hw_res->mbm_width); 338 chunks = get_corrected_mbm_count(rmid, am->chunks); 339 am->prev_msr = msr_val; 340 } else { 341 chunks = msr_val; 342 } 343 344 *val = chunks * hw_res->mon_scale; 345 346 return 0; 347 } 348 349 static void limbo_release_entry(struct rmid_entry *entry) 350 { 351 lockdep_assert_held(&rdtgroup_mutex); 352 353 rmid_limbo_count--; 354 list_add_tail(&entry->list, &rmid_free_lru); 355 356 if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 357 closid_num_dirty_rmid[entry->closid]--; 358 } 359 360 /* 361 * Check the RMIDs that are marked as busy for this domain. If the 362 * reported LLC occupancy is below the threshold clear the busy bit and 363 * decrement the count. If the busy count gets to zero on an RMID, we 364 * free the RMID 365 */ 366 void __check_limbo(struct rdt_mon_domain *d, bool force_free) 367 { 368 struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; 369 u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 370 struct rmid_entry *entry; 371 u32 idx, cur_idx = 1; 372 void *arch_mon_ctx; 373 bool rmid_dirty; 374 u64 val = 0; 375 376 arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); 377 if (IS_ERR(arch_mon_ctx)) { 378 pr_warn_ratelimited("Failed to allocate monitor context: %ld", 379 PTR_ERR(arch_mon_ctx)); 380 return; 381 } 382 383 /* 384 * Skip RMID 0 and start from RMID 1 and check all the RMIDs that 385 * are marked as busy for occupancy < threshold. If the occupancy 386 * is less than the threshold decrement the busy counter of the 387 * RMID and move it to the free list when the counter reaches 0. 388 */ 389 for (;;) { 390 idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); 391 if (idx >= idx_limit) 392 break; 393 394 entry = __rmid_entry(idx); 395 if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, 396 QOS_L3_OCCUP_EVENT_ID, &val, 397 arch_mon_ctx)) { 398 rmid_dirty = true; 399 } else { 400 rmid_dirty = (val >= resctrl_rmid_realloc_threshold); 401 402 /* 403 * x86's CLOSID and RMID are independent numbers, so the entry's 404 * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the 405 * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't 406 * used to select the configuration. It is thus necessary to track both 407 * CLOSID and RMID because there may be dependencies between them 408 * on some architectures. 409 */ 410 trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); 411 } 412 413 if (force_free || !rmid_dirty) { 414 clear_bit(idx, d->rmid_busy_llc); 415 if (!--entry->busy) 416 limbo_release_entry(entry); 417 } 418 cur_idx = idx + 1; 419 } 420 421 resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); 422 } 423 424 bool has_busy_rmid(struct rdt_mon_domain *d) 425 { 426 u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 427 428 return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; 429 } 430 431 static struct rmid_entry *resctrl_find_free_rmid(u32 closid) 432 { 433 struct rmid_entry *itr; 434 u32 itr_idx, cmp_idx; 435 436 if (list_empty(&rmid_free_lru)) 437 return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); 438 439 list_for_each_entry(itr, &rmid_free_lru, list) { 440 /* 441 * Get the index of this free RMID, and the index it would need 442 * to be if it were used with this CLOSID. 443 * If the CLOSID is irrelevant on this architecture, the two 444 * index values are always the same on every entry and thus the 445 * very first entry will be returned. 446 */ 447 itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); 448 cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); 449 450 if (itr_idx == cmp_idx) 451 return itr; 452 } 453 454 return ERR_PTR(-ENOSPC); 455 } 456 457 /** 458 * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated 459 * RMID are clean, or the CLOSID that has 460 * the most clean RMID. 461 * 462 * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID 463 * may not be able to allocate clean RMID. To avoid this the allocator will 464 * choose the CLOSID with the most clean RMID. 465 * 466 * When the CLOSID and RMID are independent numbers, the first free CLOSID will 467 * be returned. 468 */ 469 int resctrl_find_cleanest_closid(void) 470 { 471 u32 cleanest_closid = ~0; 472 int i = 0; 473 474 lockdep_assert_held(&rdtgroup_mutex); 475 476 if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 477 return -EIO; 478 479 for (i = 0; i < closids_supported(); i++) { 480 int num_dirty; 481 482 if (closid_allocated(i)) 483 continue; 484 485 num_dirty = closid_num_dirty_rmid[i]; 486 if (num_dirty == 0) 487 return i; 488 489 if (cleanest_closid == ~0) 490 cleanest_closid = i; 491 492 if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) 493 cleanest_closid = i; 494 } 495 496 if (cleanest_closid == ~0) 497 return -ENOSPC; 498 499 return cleanest_closid; 500 } 501 502 /* 503 * For MPAM the RMID value is not unique, and has to be considered with 504 * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which 505 * allows all domains to be managed by a single free list. 506 * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. 507 */ 508 int alloc_rmid(u32 closid) 509 { 510 struct rmid_entry *entry; 511 512 lockdep_assert_held(&rdtgroup_mutex); 513 514 entry = resctrl_find_free_rmid(closid); 515 if (IS_ERR(entry)) 516 return PTR_ERR(entry); 517 518 list_del(&entry->list); 519 return entry->rmid; 520 } 521 522 static void add_rmid_to_limbo(struct rmid_entry *entry) 523 { 524 struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; 525 struct rdt_mon_domain *d; 526 u32 idx; 527 528 lockdep_assert_held(&rdtgroup_mutex); 529 530 /* Walking r->domains, ensure it can't race with cpuhp */ 531 lockdep_assert_cpus_held(); 532 533 idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); 534 535 entry->busy = 0; 536 list_for_each_entry(d, &r->mon_domains, hdr.list) { 537 /* 538 * For the first limbo RMID in the domain, 539 * setup up the limbo worker. 540 */ 541 if (!has_busy_rmid(d)) 542 cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, 543 RESCTRL_PICK_ANY_CPU); 544 set_bit(idx, d->rmid_busy_llc); 545 entry->busy++; 546 } 547 548 rmid_limbo_count++; 549 if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) 550 closid_num_dirty_rmid[entry->closid]++; 551 } 552 553 void free_rmid(u32 closid, u32 rmid) 554 { 555 u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); 556 struct rmid_entry *entry; 557 558 lockdep_assert_held(&rdtgroup_mutex); 559 560 /* 561 * Do not allow the default rmid to be free'd. Comparing by index 562 * allows architectures that ignore the closid parameter to avoid an 563 * unnecessary check. 564 */ 565 if (!resctrl_arch_mon_capable() || 566 idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, 567 RESCTRL_RESERVED_RMID)) 568 return; 569 570 entry = __rmid_entry(idx); 571 572 if (is_llc_occupancy_enabled()) 573 add_rmid_to_limbo(entry); 574 else 575 list_add_tail(&entry->list, &rmid_free_lru); 576 } 577 578 static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, 579 u32 rmid, enum resctrl_event_id evtid) 580 { 581 u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); 582 583 switch (evtid) { 584 case QOS_L3_MBM_TOTAL_EVENT_ID: 585 return &d->mbm_total[idx]; 586 case QOS_L3_MBM_LOCAL_EVENT_ID: 587 return &d->mbm_local[idx]; 588 default: 589 return NULL; 590 } 591 } 592 593 static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) 594 { 595 int cpu = smp_processor_id(); 596 struct rdt_mon_domain *d; 597 struct mbm_state *m; 598 int err, ret; 599 u64 tval = 0; 600 601 if (rr->first) { 602 resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); 603 m = get_mbm_state(rr->d, closid, rmid, rr->evtid); 604 if (m) 605 memset(m, 0, sizeof(struct mbm_state)); 606 return 0; 607 } 608 609 if (rr->d) { 610 /* Reading a single domain, must be on a CPU in that domain. */ 611 if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) 612 return -EINVAL; 613 rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, 614 rr->evtid, &tval, rr->arch_mon_ctx); 615 if (rr->err) 616 return rr->err; 617 618 rr->val += tval; 619 620 return 0; 621 } 622 623 /* Summing domains that share a cache, must be on a CPU for that cache. */ 624 if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) 625 return -EINVAL; 626 627 /* 628 * Legacy files must report the sum of an event across all 629 * domains that share the same L3 cache instance. 630 * Report success if a read from any domain succeeds, -EINVAL 631 * (translated to "Unavailable" for user space) if reading from 632 * all domains fail for any reason. 633 */ 634 ret = -EINVAL; 635 list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { 636 if (d->ci->id != rr->ci->id) 637 continue; 638 err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, 639 rr->evtid, &tval, rr->arch_mon_ctx); 640 if (!err) { 641 rr->val += tval; 642 ret = 0; 643 } 644 } 645 646 if (ret) 647 rr->err = ret; 648 649 return ret; 650 } 651 652 /* 653 * mbm_bw_count() - Update bw count from values previously read by 654 * __mon_event_count(). 655 * @closid: The closid used to identify the cached mbm_state. 656 * @rmid: The rmid used to identify the cached mbm_state. 657 * @rr: The struct rmid_read populated by __mon_event_count(). 658 * 659 * Supporting function to calculate the memory bandwidth 660 * and delta bandwidth in MBps. The chunks value previously read by 661 * __mon_event_count() is compared with the chunks value from the previous 662 * invocation. This must be called once per second to maintain values in MBps. 663 */ 664 static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) 665 { 666 u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); 667 struct mbm_state *m = &rr->d->mbm_local[idx]; 668 u64 cur_bw, bytes, cur_bytes; 669 670 cur_bytes = rr->val; 671 bytes = cur_bytes - m->prev_bw_bytes; 672 m->prev_bw_bytes = cur_bytes; 673 674 cur_bw = bytes / SZ_1M; 675 676 m->prev_bw = cur_bw; 677 } 678 679 /* 680 * This is scheduled by mon_event_read() to read the CQM/MBM counters 681 * on a domain. 682 */ 683 void mon_event_count(void *info) 684 { 685 struct rdtgroup *rdtgrp, *entry; 686 struct rmid_read *rr = info; 687 struct list_head *head; 688 int ret; 689 690 rdtgrp = rr->rgrp; 691 692 ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); 693 694 /* 695 * For Ctrl groups read data from child monitor groups and 696 * add them together. Count events which are read successfully. 697 * Discard the rmid_read's reporting errors. 698 */ 699 head = &rdtgrp->mon.crdtgrp_list; 700 701 if (rdtgrp->type == RDTCTRL_GROUP) { 702 list_for_each_entry(entry, head, mon.crdtgrp_list) { 703 if (__mon_event_count(entry->closid, entry->mon.rmid, 704 rr) == 0) 705 ret = 0; 706 } 707 } 708 709 /* 710 * __mon_event_count() calls for newly created monitor groups may 711 * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. 712 * Discard error if any of the monitor event reads succeeded. 713 */ 714 if (ret == 0) 715 rr->err = 0; 716 } 717 718 /* 719 * Feedback loop for MBA software controller (mba_sc) 720 * 721 * mba_sc is a feedback loop where we periodically read MBM counters and 722 * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so 723 * that: 724 * 725 * current bandwidth(cur_bw) < user specified bandwidth(user_bw) 726 * 727 * This uses the MBM counters to measure the bandwidth and MBA throttle 728 * MSRs to control the bandwidth for a particular rdtgrp. It builds on the 729 * fact that resctrl rdtgroups have both monitoring and control. 730 * 731 * The frequency of the checks is 1s and we just tag along the MBM overflow 732 * timer. Having 1s interval makes the calculation of bandwidth simpler. 733 * 734 * Although MBA's goal is to restrict the bandwidth to a maximum, there may 735 * be a need to increase the bandwidth to avoid unnecessarily restricting 736 * the L2 <-> L3 traffic. 737 * 738 * Since MBA controls the L2 external bandwidth where as MBM measures the 739 * L3 external bandwidth the following sequence could lead to such a 740 * situation. 741 * 742 * Consider an rdtgroup which had high L3 <-> memory traffic in initial 743 * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but 744 * after some time rdtgroup has mostly L2 <-> L3 traffic. 745 * 746 * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its 747 * throttle MSRs already have low percentage values. To avoid 748 * unnecessarily restricting such rdtgroups, we also increase the bandwidth. 749 */ 750 static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) 751 { 752 u32 closid, rmid, cur_msr_val, new_msr_val; 753 struct mbm_state *pmbm_data, *cmbm_data; 754 struct rdt_ctrl_domain *dom_mba; 755 struct rdt_resource *r_mba; 756 u32 cur_bw, user_bw, idx; 757 struct list_head *head; 758 struct rdtgroup *entry; 759 760 if (!is_mbm_local_enabled()) 761 return; 762 763 r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; 764 765 closid = rgrp->closid; 766 rmid = rgrp->mon.rmid; 767 idx = resctrl_arch_rmid_idx_encode(closid, rmid); 768 pmbm_data = &dom_mbm->mbm_local[idx]; 769 770 dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); 771 if (!dom_mba) { 772 pr_warn_once("Failure to get domain for MBA update\n"); 773 return; 774 } 775 776 cur_bw = pmbm_data->prev_bw; 777 user_bw = dom_mba->mbps_val[closid]; 778 779 /* MBA resource doesn't support CDP */ 780 cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); 781 782 /* 783 * For Ctrl groups read data from child monitor groups. 784 */ 785 head = &rgrp->mon.crdtgrp_list; 786 list_for_each_entry(entry, head, mon.crdtgrp_list) { 787 cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; 788 cur_bw += cmbm_data->prev_bw; 789 } 790 791 /* 792 * Scale up/down the bandwidth linearly for the ctrl group. The 793 * bandwidth step is the bandwidth granularity specified by the 794 * hardware. 795 * Always increase throttling if current bandwidth is above the 796 * target set by user. 797 * But avoid thrashing up and down on every poll by checking 798 * whether a decrease in throttling is likely to push the group 799 * back over target. E.g. if currently throttling to 30% of bandwidth 800 * on a system with 10% granularity steps, check whether moving to 801 * 40% would go past the limit by multiplying current bandwidth by 802 * "(30 + 10) / 30". 803 */ 804 if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { 805 new_msr_val = cur_msr_val - r_mba->membw.bw_gran; 806 } else if (cur_msr_val < MAX_MBA_BW && 807 (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { 808 new_msr_val = cur_msr_val + r_mba->membw.bw_gran; 809 } else { 810 return; 811 } 812 813 resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); 814 } 815 816 static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, 817 u32 closid, u32 rmid) 818 { 819 struct rmid_read rr = {0}; 820 821 rr.r = r; 822 rr.d = d; 823 824 /* 825 * This is protected from concurrent reads from user 826 * as both the user and we hold the global mutex. 827 */ 828 if (is_mbm_total_enabled()) { 829 rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; 830 rr.val = 0; 831 rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); 832 if (IS_ERR(rr.arch_mon_ctx)) { 833 pr_warn_ratelimited("Failed to allocate monitor context: %ld", 834 PTR_ERR(rr.arch_mon_ctx)); 835 return; 836 } 837 838 __mon_event_count(closid, rmid, &rr); 839 840 resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); 841 } 842 if (is_mbm_local_enabled()) { 843 rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; 844 rr.val = 0; 845 rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); 846 if (IS_ERR(rr.arch_mon_ctx)) { 847 pr_warn_ratelimited("Failed to allocate monitor context: %ld", 848 PTR_ERR(rr.arch_mon_ctx)); 849 return; 850 } 851 852 __mon_event_count(closid, rmid, &rr); 853 854 /* 855 * Call the MBA software controller only for the 856 * control groups and when user has enabled 857 * the software controller explicitly. 858 */ 859 if (is_mba_sc(NULL)) 860 mbm_bw_count(closid, rmid, &rr); 861 862 resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); 863 } 864 } 865 866 /* 867 * Handler to scan the limbo list and move the RMIDs 868 * to free list whose occupancy < threshold_occupancy. 869 */ 870 void cqm_handle_limbo(struct work_struct *work) 871 { 872 unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); 873 struct rdt_mon_domain *d; 874 875 cpus_read_lock(); 876 mutex_lock(&rdtgroup_mutex); 877 878 d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); 879 880 __check_limbo(d, false); 881 882 if (has_busy_rmid(d)) { 883 d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, 884 RESCTRL_PICK_ANY_CPU); 885 schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, 886 delay); 887 } 888 889 mutex_unlock(&rdtgroup_mutex); 890 cpus_read_unlock(); 891 } 892 893 /** 894 * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this 895 * domain. 896 * @dom: The domain the limbo handler should run for. 897 * @delay_ms: How far in the future the handler should run. 898 * @exclude_cpu: Which CPU the handler should not run on, 899 * RESCTRL_PICK_ANY_CPU to pick any CPU. 900 */ 901 void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 902 int exclude_cpu) 903 { 904 unsigned long delay = msecs_to_jiffies(delay_ms); 905 int cpu; 906 907 cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); 908 dom->cqm_work_cpu = cpu; 909 910 if (cpu < nr_cpu_ids) 911 schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); 912 } 913 914 void mbm_handle_overflow(struct work_struct *work) 915 { 916 unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); 917 struct rdtgroup *prgrp, *crgrp; 918 struct rdt_mon_domain *d; 919 struct list_head *head; 920 struct rdt_resource *r; 921 922 cpus_read_lock(); 923 mutex_lock(&rdtgroup_mutex); 924 925 /* 926 * If the filesystem has been unmounted this work no longer needs to 927 * run. 928 */ 929 if (!resctrl_mounted || !resctrl_arch_mon_capable()) 930 goto out_unlock; 931 932 r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; 933 d = container_of(work, struct rdt_mon_domain, mbm_over.work); 934 935 list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { 936 mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); 937 938 head = &prgrp->mon.crdtgrp_list; 939 list_for_each_entry(crgrp, head, mon.crdtgrp_list) 940 mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); 941 942 if (is_mba_sc(NULL)) 943 update_mba_bw(prgrp, d); 944 } 945 946 /* 947 * Re-check for housekeeping CPUs. This allows the overflow handler to 948 * move off a nohz_full CPU quickly. 949 */ 950 d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, 951 RESCTRL_PICK_ANY_CPU); 952 schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); 953 954 out_unlock: 955 mutex_unlock(&rdtgroup_mutex); 956 cpus_read_unlock(); 957 } 958 959 /** 960 * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this 961 * domain. 962 * @dom: The domain the overflow handler should run for. 963 * @delay_ms: How far in the future the handler should run. 964 * @exclude_cpu: Which CPU the handler should not run on, 965 * RESCTRL_PICK_ANY_CPU to pick any CPU. 966 */ 967 void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, 968 int exclude_cpu) 969 { 970 unsigned long delay = msecs_to_jiffies(delay_ms); 971 int cpu; 972 973 /* 974 * When a domain comes online there is no guarantee the filesystem is 975 * mounted. If not, there is no need to catch counter overflow. 976 */ 977 if (!resctrl_mounted || !resctrl_arch_mon_capable()) 978 return; 979 cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); 980 dom->mbm_work_cpu = cpu; 981 982 if (cpu < nr_cpu_ids) 983 schedule_delayed_work_on(cpu, &dom->mbm_over, delay); 984 } 985 986 static int dom_data_init(struct rdt_resource *r) 987 { 988 u32 idx_limit = resctrl_arch_system_num_rmid_idx(); 989 u32 num_closid = resctrl_arch_get_num_closid(r); 990 struct rmid_entry *entry = NULL; 991 int err = 0, i; 992 u32 idx; 993 994 mutex_lock(&rdtgroup_mutex); 995 if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 996 u32 *tmp; 997 998 /* 999 * If the architecture hasn't provided a sanitised value here, 1000 * this may result in larger arrays than necessary. Resctrl will 1001 * use a smaller system wide value based on the resources in 1002 * use. 1003 */ 1004 tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); 1005 if (!tmp) { 1006 err = -ENOMEM; 1007 goto out_unlock; 1008 } 1009 1010 closid_num_dirty_rmid = tmp; 1011 } 1012 1013 rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); 1014 if (!rmid_ptrs) { 1015 if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 1016 kfree(closid_num_dirty_rmid); 1017 closid_num_dirty_rmid = NULL; 1018 } 1019 err = -ENOMEM; 1020 goto out_unlock; 1021 } 1022 1023 for (i = 0; i < idx_limit; i++) { 1024 entry = &rmid_ptrs[i]; 1025 INIT_LIST_HEAD(&entry->list); 1026 1027 resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); 1028 list_add_tail(&entry->list, &rmid_free_lru); 1029 } 1030 1031 /* 1032 * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and 1033 * are always allocated. These are used for the rdtgroup_default 1034 * control group, which will be setup later in rdtgroup_init(). 1035 */ 1036 idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, 1037 RESCTRL_RESERVED_RMID); 1038 entry = __rmid_entry(idx); 1039 list_del(&entry->list); 1040 1041 out_unlock: 1042 mutex_unlock(&rdtgroup_mutex); 1043 1044 return err; 1045 } 1046 1047 static void __exit dom_data_exit(void) 1048 { 1049 mutex_lock(&rdtgroup_mutex); 1050 1051 if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { 1052 kfree(closid_num_dirty_rmid); 1053 closid_num_dirty_rmid = NULL; 1054 } 1055 1056 kfree(rmid_ptrs); 1057 rmid_ptrs = NULL; 1058 1059 mutex_unlock(&rdtgroup_mutex); 1060 } 1061 1062 static struct mon_evt llc_occupancy_event = { 1063 .name = "llc_occupancy", 1064 .evtid = QOS_L3_OCCUP_EVENT_ID, 1065 }; 1066 1067 static struct mon_evt mbm_total_event = { 1068 .name = "mbm_total_bytes", 1069 .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, 1070 }; 1071 1072 static struct mon_evt mbm_local_event = { 1073 .name = "mbm_local_bytes", 1074 .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, 1075 }; 1076 1077 /* 1078 * Initialize the event list for the resource. 1079 * 1080 * Note that MBM events are also part of RDT_RESOURCE_L3 resource 1081 * because as per the SDM the total and local memory bandwidth 1082 * are enumerated as part of L3 monitoring. 1083 */ 1084 static void l3_mon_evt_init(struct rdt_resource *r) 1085 { 1086 INIT_LIST_HEAD(&r->evt_list); 1087 1088 if (is_llc_occupancy_enabled()) 1089 list_add_tail(&llc_occupancy_event.list, &r->evt_list); 1090 if (is_mbm_total_enabled()) 1091 list_add_tail(&mbm_total_event.list, &r->evt_list); 1092 if (is_mbm_local_enabled()) 1093 list_add_tail(&mbm_local_event.list, &r->evt_list); 1094 } 1095 1096 /* 1097 * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1 1098 * which indicates that RMIDs are configured in legacy mode. 1099 * This mode is incompatible with Linux resctrl semantics 1100 * as RMIDs are partitioned between SNC nodes, which requires 1101 * a user to know which RMID is allocated to a task. 1102 * Clearing bit 0 reconfigures the RMID counters for use 1103 * in RMID sharing mode. This mode is better for Linux. 1104 * The RMID space is divided between all SNC nodes with the 1105 * RMIDs renumbered to start from zero in each node when 1106 * counting operations from tasks. Code to read the counters 1107 * must adjust RMID counter numbers based on SNC node. See 1108 * logical_rmid_to_physical_rmid() for code that does this. 1109 */ 1110 void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d) 1111 { 1112 if (snc_nodes_per_l3_cache > 1) 1113 msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); 1114 } 1115 1116 /* CPU models that support MSR_RMID_SNC_CONFIG */ 1117 static const struct x86_cpu_id snc_cpu_ids[] __initconst = { 1118 X86_MATCH_VFM(INTEL_ICELAKE_X, 0), 1119 X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0), 1120 X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0), 1121 X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0), 1122 X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0), 1123 {} 1124 }; 1125 1126 /* 1127 * There isn't a simple hardware bit that indicates whether a CPU is running 1128 * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the 1129 * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in 1130 * the same NUMA node as CPU0. 1131 * It is not possible to accurately determine SNC state if the system is 1132 * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes 1133 * to L3 caches. It will be OK if system is booted with hyperthreading 1134 * disabled (since this doesn't affect the ratio). 1135 */ 1136 static __init int snc_get_config(void) 1137 { 1138 struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE); 1139 const cpumask_t *node0_cpumask; 1140 int cpus_per_node, cpus_per_l3; 1141 int ret; 1142 1143 if (!x86_match_cpu(snc_cpu_ids) || !ci) 1144 return 1; 1145 1146 cpus_read_lock(); 1147 if (num_online_cpus() != num_present_cpus()) 1148 pr_warn("Some CPUs offline, SNC detection may be incorrect\n"); 1149 cpus_read_unlock(); 1150 1151 node0_cpumask = cpumask_of_node(cpu_to_node(0)); 1152 1153 cpus_per_node = cpumask_weight(node0_cpumask); 1154 cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map); 1155 1156 if (!cpus_per_node || !cpus_per_l3) 1157 return 1; 1158 1159 ret = cpus_per_l3 / cpus_per_node; 1160 1161 /* sanity check: Only valid results are 1, 2, 3, 4 */ 1162 switch (ret) { 1163 case 1: 1164 break; 1165 case 2 ... 4: 1166 pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret); 1167 rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE; 1168 break; 1169 default: 1170 pr_warn("Ignore improbable SNC node count %d\n", ret); 1171 ret = 1; 1172 break; 1173 } 1174 1175 return ret; 1176 } 1177 1178 int __init rdt_get_mon_l3_config(struct rdt_resource *r) 1179 { 1180 unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; 1181 struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); 1182 unsigned int threshold; 1183 int ret; 1184 1185 snc_nodes_per_l3_cache = snc_get_config(); 1186 1187 resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; 1188 hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; 1189 r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; 1190 hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; 1191 1192 if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) 1193 hw_res->mbm_width += mbm_offset; 1194 else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) 1195 pr_warn("Ignoring impossible MBM counter offset\n"); 1196 1197 /* 1198 * A reasonable upper limit on the max threshold is the number 1199 * of lines tagged per RMID if all RMIDs have the same number of 1200 * lines tagged in the LLC. 1201 * 1202 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. 1203 */ 1204 threshold = resctrl_rmid_realloc_limit / r->num_rmid; 1205 1206 /* 1207 * Because num_rmid may not be a power of two, round the value 1208 * to the nearest multiple of hw_res->mon_scale so it matches a 1209 * value the hardware will measure. mon_scale may not be a power of 2. 1210 */ 1211 resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); 1212 1213 ret = dom_data_init(r); 1214 if (ret) 1215 return ret; 1216 1217 if (rdt_cpu_has(X86_FEATURE_BMEC)) { 1218 u32 eax, ebx, ecx, edx; 1219 1220 /* Detect list of bandwidth sources that can be tracked */ 1221 cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); 1222 hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; 1223 1224 if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { 1225 mbm_total_event.configurable = true; 1226 mbm_config_rftype_init("mbm_total_bytes_config"); 1227 } 1228 if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { 1229 mbm_local_event.configurable = true; 1230 mbm_config_rftype_init("mbm_local_bytes_config"); 1231 } 1232 } 1233 1234 l3_mon_evt_init(r); 1235 1236 r->mon_capable = true; 1237 1238 return 0; 1239 } 1240 1241 void __exit rdt_put_mon_l3_config(void) 1242 { 1243 dom_data_exit(); 1244 } 1245 1246 void __init intel_rdt_mbm_apply_quirk(void) 1247 { 1248 int cf_index; 1249 1250 cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1; 1251 if (cf_index >= ARRAY_SIZE(mbm_cf_table)) { 1252 pr_info("No MBM correction factor available\n"); 1253 return; 1254 } 1255 1256 mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; 1257 mbm_cf = mbm_cf_table[cf_index].cf; 1258 } 1259

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

TOMOYO Linux Cross Reference Linux/arch/x86/kernel/cpu/resctrl/monitor.c

TOMOYO Linux Cross Reference
Linux/arch/x86/kernel/cpu/resctrl/monitor.c