~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/arch/s390/kernel/hiperdispatch.c

Version: ~ [ linux-6.12-rc7 ] ~ [ linux-6.11.7 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.60 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.116 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.171 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.229 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.285 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.323 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.12 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * Copyright IBM Corp. 2024
  4  */
  5 
  6 #define KMSG_COMPONENT "hd"
  7 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  8 
  9 /*
 10  * Hiperdispatch:
 11  * Dynamically calculates the optimum number of high capacity COREs
 12  * by considering the state the system is in. When hiperdispatch decides
 13  * that a capacity update is necessary, it schedules a topology update.
 14  * During topology updates the CPU capacities are always re-adjusted.
 15  *
 * There are two places where CPU capacities are accessed within
 17  * hiperdispatch.
 * -> hiperdispatch's recurring work function reads CPU capacities to
 19  *    determine high capacity CPU count.
 20  * -> during a topology update hiperdispatch's adjustment function
 21  *    updates CPU capacities.
 22  * These two can run on different CPUs in parallel which can cause
 23  * hiperdispatch to make wrong decisions. This can potentially cause
 24  * some overhead by leading to extra rebuild_sched_domains() calls
 25  * for correction. Access to capacities within hiperdispatch has to be
 26  * serialized to prevent the overhead.
 27  *
 28  * Hiperdispatch decision making revolves around steal time.
 29  * HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time
 30  * crosses the threshold value hiperdispatch falls back to giving high
 31  * capacities to entitled CPUs. When steal time drops below the
 32  * threshold boundary, hiperdispatch utilizes all CPUs by giving all
 33  * of them high capacity.
 34  *
 35  * The theory behind HD_STEAL_THRESHOLD is related to the SMP thread
 36  * performance. Comparing the throughput of;
 37  * - single CORE, with N threads, running N tasks
 38  * - N separate COREs running N tasks,
 * using individual COREs for individual tasks yields better
 40  * performance. This performance difference is roughly ~30% (can change
 41  * between machine generations)
 42  *
 43  * Hiperdispatch tries to hint scheduler to use individual COREs for
 * each task, as long as steal time on those COREs is less than 30%,
 45  * therefore delaying the throughput loss caused by using SMP threads.
 46  */
 47 
 48 #include <linux/cpumask.h>
 49 #include <linux/debugfs.h>
 50 #include <linux/device.h>
 51 #include <linux/kernel_stat.h>
 52 #include <linux/kstrtox.h>
 53 #include <linux/ktime.h>
 54 #include <linux/sysctl.h>
 55 #include <linux/types.h>
 56 #include <linux/workqueue.h>
 57 #include <asm/hiperdispatch.h>
 58 #include <asm/setup.h>
 59 #include <asm/smp.h>
 60 #include <asm/topology.h>
 61 
 62 #define CREATE_TRACE_POINTS
 63 #include <asm/trace/hiperdispatch.h>
 64 
 65 #define HD_DELAY_FACTOR                 (4)
 66 #define HD_DELAY_INTERVAL               (HZ / 4)
 67 #define HD_STEAL_THRESHOLD              30
 68 #define HD_STEAL_AVG_WEIGHT             16
 69 
 70 static cpumask_t hd_vl_coremask;        /* Mask containing all vertical low COREs */
 71 static cpumask_t hd_vmvl_cpumask;       /* Mask containing vertical medium and low CPUs */
 72 static int hd_high_capacity_cores;      /* Current CORE count with high capacity */
 73 static int hd_entitled_cores;           /* Total vertical high and medium CORE count */
 74 static int hd_online_cores;             /* Current online CORE count */
 75 
 76 static unsigned long hd_previous_steal; /* Previous iteration's CPU steal timer total */
 77 static unsigned long hd_high_time;      /* Total time spent while all cpus have high capacity */
 78 static unsigned long hd_low_time;       /* Total time spent while vl cpus have low capacity */
 79 static atomic64_t hd_adjustments;       /* Total occurrence count of hiperdispatch adjustments */
 80 
/* Steal percentage above which only entitled COREs get high capacity; sysfs-tunable */
static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD;
/* Multiplier on HD_DELAY_INTERVAL for the initial work delay; sysfs-tunable */
static unsigned int hd_delay_factor = HD_DELAY_FACTOR;
static int hd_enabled;	/* Hiperdispatch mode; 0 == disabled */

static void hd_capacity_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn);
 87 
 88 static int hd_set_hiperdispatch_mode(int enable)
 89 {
 90         if (!MACHINE_HAS_TOPOLOGY)
 91                 enable = 0;
 92         if (hd_enabled == enable)
 93                 return 0;
 94         hd_enabled = enable;
 95         return 1;
 96 }
 97 
 98 void hd_reset_state(void)
 99 {
100         cpumask_clear(&hd_vl_coremask);
101         cpumask_clear(&hd_vmvl_cpumask);
102         hd_entitled_cores = 0;
103         hd_online_cores = 0;
104 }
105 
106 void hd_add_core(int cpu)
107 {
108         const struct cpumask *siblings;
109         int polarization;
110 
111         hd_online_cores++;
112         polarization = smp_cpu_get_polarization(cpu);
113         siblings = topology_sibling_cpumask(cpu);
114         switch (polarization) {
115         case POLARIZATION_VH:
116                 hd_entitled_cores++;
117                 break;
118         case POLARIZATION_VM:
119                 hd_entitled_cores++;
120                 cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
121                 break;
122         case POLARIZATION_VL:
123                 cpumask_set_cpu(cpu, &hd_vl_coremask);
124                 cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings);
125                 break;
126         }
127 }
128 
129 /* Serialize update and read operations of debug counters. */
130 static DEFINE_MUTEX(hd_counter_mutex);
131 
132 static void hd_update_times(void)
133 {
134         static ktime_t prev;
135         ktime_t now;
136 
137         /*
138          * Check if hiperdispatch is active, if not set the prev to 0.
139          * This way it is possible to differentiate the first update iteration after
140          * enabling hiperdispatch.
141          */
142         if (hd_entitled_cores == 0 || hd_enabled == 0) {
143                 prev = ktime_set(0, 0);
144                 return;
145         }
146         now = ktime_get();
147         if (ktime_after(prev, 0)) {
148                 if (hd_high_capacity_cores == hd_online_cores)
149                         hd_high_time += ktime_ms_delta(now, prev);
150                 else
151                         hd_low_time += ktime_ms_delta(now, prev);
152         }
153         prev = now;
154 }
155 
156 static void hd_update_capacities(void)
157 {
158         int cpu, upscaling_cores;
159         unsigned long capacity;
160 
161         upscaling_cores = hd_high_capacity_cores - hd_entitled_cores;
162         capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW;
163         hd_high_capacity_cores = hd_entitled_cores;
164         for_each_cpu(cpu, &hd_vl_coremask) {
165                 smp_set_core_capacity(cpu, capacity);
166                 if (capacity != CPU_CAPACITY_HIGH)
167                         continue;
168                 hd_high_capacity_cores++;
169                 upscaling_cores--;
170                 if (upscaling_cores == 0)
171                         capacity = CPU_CAPACITY_LOW;
172         }
173 }
174 
/*
 * Stop hiperdispatch: cancel the periodic worker first so it cannot
 * race with the state reset below, then mark every online CORE as
 * high capacity and drop the cached steal-time total.
 */
void hd_disable_hiperdispatch(void)
{
	cancel_delayed_work_sync(&hd_capacity_work);
	hd_high_capacity_cores = hd_online_cores;
	hd_previous_steal = 0;
}
181 
182 int hd_enable_hiperdispatch(void)
183 {
184         mutex_lock(&hd_counter_mutex);
185         hd_update_times();
186         mutex_unlock(&hd_counter_mutex);
187         if (hd_enabled == 0)
188                 return 0;
189         if (hd_entitled_cores == 0)
190                 return 0;
191         if (hd_online_cores <= hd_entitled_cores)
192                 return 0;
193         mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor);
194         hd_update_capacities();
195         return 1;
196 }
197 
198 static unsigned long hd_steal_avg(unsigned long new)
199 {
200         static unsigned long steal;
201 
202         steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT;
203         return steal;
204 }
205 
/*
 * Compute the average steal time percentage of the vertical medium and
 * low CPUs since the previous invocation. Keeps the previous total and
 * timestamp in hd_previous_steal and a function-local static, so
 * consecutive calls measure consecutive intervals.
 */
static unsigned long hd_calculate_steal_percentage(void)
{
	unsigned long time_delta, steal_delta, steal, percentage;
	static ktime_t prev;
	int cpus, cpu;
	ktime_t now;

	cpus = 0;
	steal = 0;
	percentage = 0;
	/* Sum the cumulative steal time of all vertical medium/low CPUs. */
	for_each_cpu(cpu, &hd_vmvl_cpumask) {
		steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
		cpus++;
	}
	/*
	 * If there are no vertical medium and low CPUs steal time
	 * is 0 as vertical high CPUs shouldn't experience steal time.
	 */
	if (cpus == 0)
		return percentage;
	now = ktime_get();
	time_delta = ktime_to_ns(ktime_sub(now, prev));
	/* Skip the first interval: hd_previous_steal == 0 means no baseline yet. */
	if (steal > hd_previous_steal && hd_previous_steal != 0) {
		/* Steal and time deltas are both in ns, so the units cancel. */
		steal_delta = (steal - hd_previous_steal) * 100 / time_delta;
		percentage = steal_delta / cpus;
	}
	hd_previous_steal = steal;
	prev = now;
	return percentage;
}
236 
/*
 * Periodic worker: decide between "all online COREs high capacity"
 * (low steal) and "only entitled COREs high capacity" (steal above
 * threshold) and schedule a topology update when the target changes.
 * Runs every HD_DELAY_INTERVAL while hiperdispatch is active.
 */
static void hd_capacity_work_fn(struct work_struct *work)
{
	unsigned long steal_percentage, new_cores;

	/* Serializes against topology/hotplug updates of the CORE counts. */
	mutex_lock(&smp_cpu_state_mutex);
	/*
	 * If online cores are less or equal to entitled cores hiperdispatch
	 * does not need to make any adjustments, call a topology update to
	 * disable hiperdispatch.
	 * Normally this check is handled on topology update, but during cpu
	 * unhotplug, topology and cpu mask updates are done in reverse
	 * order, causing hd_enable_hiperdispatch() to get stale data.
	 */
	if (hd_online_cores <= hd_entitled_cores) {
		topology_schedule_update();
		mutex_unlock(&smp_cpu_state_mutex);
		return;
	}
	steal_percentage = hd_steal_avg(hd_calculate_steal_percentage());
	if (steal_percentage < hd_steal_threshold)
		new_cores = hd_online_cores;
	else
		new_cores = hd_entitled_cores;
	if (hd_high_capacity_cores != new_cores) {
		trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores);
		hd_high_capacity_cores = new_cores;
		atomic64_inc(&hd_adjustments);
		/* The update re-applies capacities via hd_update_capacities(). */
		topology_schedule_update();
	}
	trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores);
	mutex_unlock(&smp_cpu_state_mutex);
	/* Re-arm for the next sampling interval. */
	schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL);
}
270 
/*
 * Sysctl handler for s390.hiperdispatch. Reads and writes go through a
 * local copy of hd_enabled; a temporary ctl_table bounds the value to
 * [0, 1]. A successful write switches the mode under
 * smp_cpu_state_mutex and triggers a topology update when the mode
 * actually changed.
 */
static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	int hiperdispatch;
	int rc;
	struct ctl_table ctl_entry = {
		.procname	= ctl->procname,
		.data		= &hiperdispatch,
		.maxlen		= sizeof(int),
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	};

	hiperdispatch = hd_enabled;
	rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos);
	/* Reads (and failed writes) are finished at this point. */
	if (rc < 0 || !write)
		return rc;
	mutex_lock(&smp_cpu_state_mutex);
	if (hd_set_hiperdispatch_mode(hiperdispatch))
		topology_schedule_update();
	mutex_unlock(&smp_cpu_state_mutex);
	return 0;
}
294 
/* Exposes /proc/sys/s390/hiperdispatch (0 = off, 1 = on). */
static struct ctl_table hiperdispatch_ctl_table[] = {
	{
		.procname	= "hiperdispatch",
		.mode		= 0644,
		.proc_handler	= hiperdispatch_ctl_handler,
	},
};
302 
/* Sysfs read: report the current steal threshold percentage. */
static ssize_t hd_steal_threshold_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_steal_threshold);
}
309 
310 static ssize_t hd_steal_threshold_store(struct device *dev,
311                                         struct device_attribute *attr,
312                                         const char *buf,
313                                         size_t count)
314 {
315         unsigned int val;
316         int rc;
317 
318         rc = kstrtouint(buf, 0, &val);
319         if (rc)
320                 return rc;
321         if (val > 100)
322                 return -ERANGE;
323         hd_steal_threshold = val;
324         return count;
325 }
326 
327 static DEVICE_ATTR_RW(hd_steal_threshold);
328 
/* Sysfs read: report the current initial-delay factor. */
static ssize_t hd_delay_factor_show(struct device *dev,
				    struct device_attribute *attr,
				    char *buf)
{
	return sysfs_emit(buf, "%u\n", hd_delay_factor);
}
335 
336 static ssize_t hd_delay_factor_store(struct device *dev,
337                                      struct device_attribute *attr,
338                                      const char *buf,
339                                      size_t count)
340 {
341         unsigned int val;
342         int rc;
343 
344         rc = kstrtouint(buf, 0, &val);
345         if (rc)
346                 return rc;
347         if (!val)
348                 return -ERANGE;
349         hd_delay_factor = val;
350         return count;
351 }
352 
353 static DEVICE_ATTR_RW(hd_delay_factor);
354 
/* Attributes grouped under /sys/devices/system/cpu/hiperdispatch/. */
static struct attribute *hd_attrs[] = {
	&dev_attr_hd_steal_threshold.attr,
	&dev_attr_hd_delay_factor.attr,
	NULL,
};

static const struct attribute_group hd_attr_group = {
	.name  = "hiperdispatch",
	.attrs = hd_attrs,
};
365 
/*
 * Debugfs read of greedy_time_ms: refresh the time counters, then
 * report total time spent with all CPUs at high capacity. The counter
 * mutex serializes against concurrent updates.
 */
static int hd_greedy_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_high_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n");
376 
/*
 * Debugfs read of conservative_time_ms: refresh the time counters,
 * then report total time spent with vertical low CPUs at low capacity.
 * The counter mutex serializes against concurrent updates.
 */
static int hd_conservative_time_get(void *unused, u64 *val)
{
	mutex_lock(&hd_counter_mutex);
	hd_update_times();
	*val = hd_low_time;
	mutex_unlock(&hd_counter_mutex);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n");
387 
/* Debugfs read of adjustment_count: atomic counter, no lock needed. */
static int hd_adjustment_count_get(void *unused, u64 *val)
{
	*val = atomic64_read(&hd_adjustments);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n");
395 
396 static void __init hd_create_debugfs_counters(void)
397 {
398         struct dentry *dir;
399 
400         dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir);
401         debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops);
402         debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops);
403         debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops);
404 }
405 
406 static void __init hd_create_attributes(void)
407 {
408         struct device *dev;
409 
410         dev = bus_get_dev_root(&cpu_subsys);
411         if (!dev)
412                 return;
413         if (sysfs_create_group(&dev->kobj, &hd_attr_group))
414                 pr_warn("Unable to create hiperdispatch attribute group\n");
415         put_device(dev);
416 }
417 
418 static int __init hd_init(void)
419 {
420         if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) {
421                 hd_set_hiperdispatch_mode(1);
422                 topology_schedule_update();
423         }
424         if (!register_sysctl("s390", hiperdispatch_ctl_table))
425                 pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n");
426         hd_create_debugfs_counters();
427         hd_create_attributes();
428         return 0;
429 }
430 late_initcall(hd_init);
431 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php