1 // SPDX-License-Identifier: GPL-2.0 1 2 /* 3 * Copyright IBM Corp. 2024 4 */ 5 6 #define KMSG_COMPONENT "hd" 7 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 8 9 /* 10 * Hiperdispatch: 11 * Dynamically calculates the optimum number o 12 * by considering the state the system is in. 13 * that a capacity update is necessary, it sch 14 * During topology updates the CPU capacities 15 * 16 * There are two places where CPU capacities ar 17 * hiperdispatch. 18 * -> hiperdispatch's recurring work function 19 * determine high capacity CPU count. 20 * -> during a topology update hiperdispatch's 21 * updates CPU capacities. 22 * These two can run on different CPUs in para 23 * hiperdispatch to make wrong decisions. This 24 * some overhead by leading to extra rebuild_s 25 * for correction. Access to capacities within 26 * serialized to prevent the overhead. 27 * 28 * Hiperdispatch decision making revolves arou 29 * HD_STEAL_THRESHOLD value is taken as refere 30 * crosses the threshold value hiperdispatch f 31 * capacities to entitled CPUs. When steal tim 32 * threshold boundary, hiperdispatch utilizes 33 * of them high capacity. 34 * 35 * The theory behind HD_STEAL_THRESHOLD is rel 36 * performance. Comparing the throughput of: 37 * - single CORE, with N threads, running N ta 38 * - N separate COREs running N tasks, 39 * using individual COREs for individual tasks 40 * performance. 
This performance difference is 41 * between machine generations) 42 * 43 * Hiperdispatch tries to hint scheduler to us 44 * each task, as long as steal time on those C 45 * therefore delaying the throughput loss caus 46 */ 47 48 #include <linux/cpumask.h> 49 #include <linux/debugfs.h> 50 #include <linux/device.h> 51 #include <linux/kernel_stat.h> 52 #include <linux/kstrtox.h> 53 #include <linux/ktime.h> 54 #include <linux/sysctl.h> 55 #include <linux/types.h> 56 #include <linux/workqueue.h> 57 #include <asm/hiperdispatch.h> 58 #include <asm/setup.h> 59 #include <asm/smp.h> 60 #include <asm/topology.h> 61 62 #define CREATE_TRACE_POINTS 63 #include <asm/trace/hiperdispatch.h> 64 65 #define HD_DELAY_FACTOR (4) 66 #define HD_DELAY_INTERVAL (HZ / 67 #define HD_STEAL_THRESHOLD 30 68 #define HD_STEAL_AVG_WEIGHT 16 69 70 static cpumask_t hd_vl_coremask; /* Mas 71 static cpumask_t hd_vmvl_cpumask; /* Mas 72 static int hd_high_capacity_cores; /* Cur 73 static int hd_entitled_cores; /* Tot 74 static int hd_online_cores; /* Cur 75 76 static unsigned long hd_previous_steal; /* Pre 77 static unsigned long hd_high_time; /* Tot 78 static unsigned long hd_low_time; /* Tot 79 static atomic64_t hd_adjustments; /* Tot 80 81 static unsigned int hd_steal_threshold = HD_ST 82 static unsigned int hd_delay_factor = HD_DELAY 83 static int hd_enabled; 84 85 static void hd_capacity_work_fn(struct work_st 86 static DECLARE_DELAYED_WORK(hd_capacity_work, 87 88 static int hd_set_hiperdispatch_mode(int enabl 89 { 90 if (!MACHINE_HAS_TOPOLOGY) 91 enable = 0; 92 if (hd_enabled == enable) 93 return 0; 94 hd_enabled = enable; 95 return 1; 96 } 97 98 void hd_reset_state(void) 99 { 100 cpumask_clear(&hd_vl_coremask); 101 cpumask_clear(&hd_vmvl_cpumask); 102 hd_entitled_cores = 0; 103 hd_online_cores = 0; 104 } 105 106 void hd_add_core(int cpu) 107 { 108 const struct cpumask *siblings; 109 int polarization; 110 111 hd_online_cores++; 112 polarization = smp_cpu_get_polarizatio 113 siblings = 
topology_sibling_cpumask(cp 114 switch (polarization) { 115 case POLARIZATION_VH: 116 hd_entitled_cores++; 117 break; 118 case POLARIZATION_VM: 119 hd_entitled_cores++; 120 cpumask_or(&hd_vmvl_cpumask, & 121 break; 122 case POLARIZATION_VL: 123 cpumask_set_cpu(cpu, &hd_vl_co 124 cpumask_or(&hd_vmvl_cpumask, & 125 break; 126 } 127 } 128 129 /* Serialize update and read operations of deb 130 static DEFINE_MUTEX(hd_counter_mutex); 131 132 static void hd_update_times(void) 133 { 134 static ktime_t prev; 135 ktime_t now; 136 137 /* 138 * Check if hiperdispatch is active, i 139 * This way it is possible to differen 140 * enabling hiperdispatch. 141 */ 142 if (hd_entitled_cores == 0 || hd_enabl 143 prev = ktime_set(0, 0); 144 return; 145 } 146 now = ktime_get(); 147 if (ktime_after(prev, 0)) { 148 if (hd_high_capacity_cores == 149 hd_high_time += ktime_ 150 else 151 hd_low_time += ktime_m 152 } 153 prev = now; 154 } 155 156 static void hd_update_capacities(void) 157 { 158 int cpu, upscaling_cores; 159 unsigned long capacity; 160 161 upscaling_cores = hd_high_capacity_cor 162 capacity = upscaling_cores > 0 ? 
CPU_C 163 hd_high_capacity_cores = hd_entitled_c 164 for_each_cpu(cpu, &hd_vl_coremask) { 165 smp_set_core_capacity(cpu, cap 166 if (capacity != CPU_CAPACITY_H 167 continue; 168 hd_high_capacity_cores++; 169 upscaling_cores--; 170 if (upscaling_cores == 0) 171 capacity = CPU_CAPACIT 172 } 173 } 174 175 void hd_disable_hiperdispatch(void) 176 { 177 cancel_delayed_work_sync(&hd_capacity_ 178 hd_high_capacity_cores = hd_online_cor 179 hd_previous_steal = 0; 180 } 181 182 int hd_enable_hiperdispatch(void) 183 { 184 mutex_lock(&hd_counter_mutex); 185 hd_update_times(); 186 mutex_unlock(&hd_counter_mutex); 187 if (hd_enabled == 0) 188 return 0; 189 if (hd_entitled_cores == 0) 190 return 0; 191 if (hd_online_cores <= hd_entitled_cor 192 return 0; 193 mod_delayed_work(system_wq, &hd_capaci 194 hd_update_capacities(); 195 return 1; 196 } 197 198 static unsigned long hd_steal_avg(unsigned lon 199 { 200 static unsigned long steal; 201 202 steal = (steal * (HD_STEAL_AVG_WEIGHT 203 return steal; 204 } 205 206 static unsigned long hd_calculate_steal_percen 207 { 208 unsigned long time_delta, steal_delta, 209 static ktime_t prev; 210 int cpus, cpu; 211 ktime_t now; 212 213 cpus = 0; 214 steal = 0; 215 percentage = 0; 216 for_each_cpu(cpu, &hd_vmvl_cpumask) { 217 steal += kcpustat_cpu(cpu).cpu 218 cpus++; 219 } 220 /* 221 * If there is no vertical medium and 222 * is 0 as vertical high CPUs shouldn' 223 */ 224 if (cpus == 0) 225 return percentage; 226 now = ktime_get(); 227 time_delta = ktime_to_ns(ktime_sub(now 228 if (steal > hd_previous_steal && hd_pr 229 steal_delta = (steal - hd_prev 230 percentage = steal_delta / cpu 231 } 232 hd_previous_steal = steal; 233 prev = now; 234 return percentage; 235 } 236 237 static void hd_capacity_work_fn(struct work_st 238 { 239 unsigned long steal_percentage, new_co 240 241 mutex_lock(&smp_cpu_state_mutex); 242 /* 243 * If online cores are less or equal t 244 * does not need to make any adjustmen 245 * disable hiperdispatch. 
246 * Normally this check is handled on t 247 * unhotplug, topology and cpu mask up 248 * order, causing hd_enable_hiperdispa 249 */ 250 if (hd_online_cores <= hd_entitled_cor 251 topology_schedule_update(); 252 mutex_unlock(&smp_cpu_state_mu 253 return; 254 } 255 steal_percentage = hd_steal_avg(hd_cal 256 if (steal_percentage < hd_steal_thresh 257 new_cores = hd_online_cores; 258 else 259 new_cores = hd_entitled_cores; 260 if (hd_high_capacity_cores != new_core 261 trace_s390_hd_rebuild_domains( 262 hd_high_capacity_cores = new_c 263 atomic64_inc(&hd_adjustments); 264 topology_schedule_update(); 265 } 266 trace_s390_hd_work_fn(steal_percentage 267 mutex_unlock(&smp_cpu_state_mutex); 268 schedule_delayed_work(&hd_capacity_wor 269 } 270 271 static int hiperdispatch_ctl_handler(const str 272 void *buf 273 { 274 int hiperdispatch; 275 int rc; 276 struct ctl_table ctl_entry = { 277 .procname = ctl->procnam 278 .data = &hiperdispat 279 .maxlen = sizeof(int), 280 .extra1 = SYSCTL_ZERO, 281 .extra2 = SYSCTL_ONE, 282 }; 283 284 hiperdispatch = hd_enabled; 285 rc = proc_douintvec_minmax(&ctl_entry, 286 if (rc < 0 || !write) 287 return rc; 288 mutex_lock(&smp_cpu_state_mutex); 289 if (hd_set_hiperdispatch_mode(hiperdis 290 topology_schedule_update(); 291 mutex_unlock(&smp_cpu_state_mutex); 292 return 0; 293 } 294 295 static struct ctl_table hiperdispatch_ctl_tabl 296 { 297 .procname = "hiperdispat 298 .mode = 0644, 299 .proc_handler = hiperdispatc 300 }, 301 }; 302 303 static ssize_t hd_steal_threshold_show(struct 304 struct 305 char *b 306 { 307 return sysfs_emit(buf, "%u\n", hd_stea 308 } 309 310 static ssize_t hd_steal_threshold_store(struct 311 struct 312 const 313 size_t 314 { 315 unsigned int val; 316 int rc; 317 318 rc = kstrtouint(buf, 0, &val); 319 if (rc) 320 return rc; 321 if (val > 100) 322 return -ERANGE; 323 hd_steal_threshold = val; 324 return count; 325 } 326 327 static DEVICE_ATTR_RW(hd_steal_threshold); 328 329 static ssize_t hd_delay_factor_show(struct dev 
330 struct dev 331 char *buf) 332 { 333 return sysfs_emit(buf, "%u\n", hd_dela 334 } 335 336 static ssize_t hd_delay_factor_store(struct de 337 struct de 338 const cha 339 size_t co 340 { 341 unsigned int val; 342 int rc; 343 344 rc = kstrtouint(buf, 0, &val); 345 if (rc) 346 return rc; 347 if (!val) 348 return -ERANGE; 349 hd_delay_factor = val; 350 return count; 351 } 352 353 static DEVICE_ATTR_RW(hd_delay_factor); 354 355 static struct attribute *hd_attrs[] = { 356 &dev_attr_hd_steal_threshold.attr, 357 &dev_attr_hd_delay_factor.attr, 358 NULL, 359 }; 360 361 static const struct attribute_group hd_attr_gr 362 .name = "hiperdispatch", 363 .attrs = hd_attrs, 364 }; 365 366 static int hd_greedy_time_get(void *unused, u6 367 { 368 mutex_lock(&hd_counter_mutex); 369 hd_update_times(); 370 *val = hd_high_time; 371 mutex_unlock(&hd_counter_mutex); 372 return 0; 373 } 374 375 DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, h 376 377 static int hd_conservative_time_get(void *unus 378 { 379 mutex_lock(&hd_counter_mutex); 380 hd_update_times(); 381 *val = hd_low_time; 382 mutex_unlock(&hd_counter_mutex); 383 return 0; 384 } 385 386 DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_f 387 388 static int hd_adjustment_count_get(void *unuse 389 { 390 *val = atomic64_read(&hd_adjustments); 391 return 0; 392 } 393 394 DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, h 395 396 static void __init hd_create_debugfs_counters( 397 { 398 struct dentry *dir; 399 400 dir = debugfs_create_dir("hiperdispatc 401 debugfs_create_file("conservative_time 402 debugfs_create_file("greedy_time_ms", 403 debugfs_create_file("adjustment_count" 404 } 405 406 static void __init hd_create_attributes(void) 407 { 408 struct device *dev; 409 410 dev = bus_get_dev_root(&cpu_subsys); 411 if (!dev) 412 return; 413 if (sysfs_create_group(&dev->kobj, &hd 414 pr_warn("Unable to create hipe 415 put_device(dev); 416 } 417 418 static int __init hd_init(void) 419 { 420 if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON 421 
hd_set_hiperdispatch_mode(1); 422 topology_schedule_update(); 423 } 424 if (!register_sysctl("s390", hiperdisp 425 pr_warn("Failed to register s3 426 hd_create_debugfs_counters(); 427 hd_create_attributes(); 428 return 0; 429 } 430 late_initcall(hd_init); 431
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.