Linux/kernel/cgroup/cpuset-v1.c

// SPDX-License-Identifier: GPL-2.0-or-later

#include "cpuset-internal.h"

/*
 * The legacy hierarchy's call to cgroup_transfer_tasks() is handled asynchronously.
 */
struct cpuset_remove_tasks_struct {
        struct work_struct work;
        struct cpuset *cs;
};

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned by fmeter_getrate()
 * will be cut in half every 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

#define FM_COEF 933             /* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
#define FM_MAXCNT 1000000       /* limit cnt to avoid overflow */
#define FM_SCALE 1000           /* faux fixed point scale */
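
/*
 * A rough worked example of the arithmetic above: one call to
 * fmeter_markevent() adds FM_SCALE (1000) to cnt, and the next
 * fmeter_update() folds that into val as (FM_SCALE - FM_COEF) * 1000 /
 * FM_SCALE = 67.  At a steady rate of one event per second the value
 * settles where val = (933 * val) / 1000 + 67, i.e. val == 1000,
 * matching the "N*1000" behaviour described above.  With no further
 * events, ten one-second ticks multiply val by (933/1000)^10, which is
 * roughly 0.5: the 10 second half-life.
 */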

/* Initialize a frequency meter */
void fmeter_init(struct fmeter *fmp)
{
        fmp->cnt = 0;
        fmp->val = 0;
        fmp->time = 0;
        spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
        time64_t now;
        u32 ticks;

        now = ktime_get_seconds();
        ticks = now - fmp->time;

        if (ticks == 0)
                return;

        ticks = min(FM_MAXTICKS, ticks);
        while (ticks-- > 0)
                fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
        fmp->time = now;

        fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
        fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
        spin_lock(&fmp->lock);
        fmeter_update(fmp);
        fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
        spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
        int val;

        spin_lock(&fmp->lock);
        fmeter_update(fmp);
        val = fmp->val;
        spin_unlock(&fmp->lock);
        return val;
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;

/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */

void __cpuset_memory_pressure_bump(void)
{
        rcu_read_lock();
        fmeter_markevent(&task_cs(current)->fmeter);
        rcu_read_unlock();
}
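
/*
 * Note: this is normally reached through the cpuset_memory_pressure_bump()
 * wrapper in include/linux/cpuset.h, which checks
 * cpuset_memory_pressure_enabled first, so in practice the fmeter is only
 * bumped from the synchronous (direct) reclaim path while the root cpuset
 * has enabled collection.
 */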

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
        if (val < -1 || val > sched_domain_level_max + 1)
                return -EINVAL;
#endif

        if (val != cs->relax_domain_level) {
                cs->relax_domain_level = val;
                if (!cpumask_empty(cs->cpus_allowed) &&
                    is_sched_load_balance(cs))
                        rebuild_sched_domains_locked();
        }

        return 0;
}
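
/*
 * The accepted range mirrors the sched_relax_domain_level file described in
 * Documentation/admin-guide/cgroup-v1/cpusets.rst: -1 requests the system
 * default, 0 disables the idle/wakeup balancing search, and larger values
 * widen the search scope (roughly: SMT siblings, then cores in a package,
 * up to system wide), capped here at sched_domain_level_max + 1.
 */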

static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
                            s64 val)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;
        int retval = -ENODEV;

        cpus_read_lock();
        cpuset_lock();
        if (!is_cpuset_online(cs))
                goto out_unlock;

        switch (type) {
        case FILE_SCHED_RELAX_DOMAIN_LEVEL:
                retval = update_relax_domain_level(cs, val);
                break;
        default:
                retval = -EINVAL;
                break;
        }
out_unlock:
        cpuset_unlock();
        cpus_read_unlock();
        return retval;
}

static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;

        switch (type) {
        case FILE_SCHED_RELAX_DOMAIN_LEVEL:
                return cs->relax_domain_level;
        default:
                BUG();
        }

        /* Unreachable but makes gcc happy */
        return 0;
}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held. The check can be skipped
 * if on default hierarchy.
 */
void cpuset1_update_task_spread_flags(struct cpuset *cs,
                                        struct task_struct *tsk)
{
        if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
                return;

        if (is_spread_page(cs))
                task_set_spread_page(tsk);
        else
                task_clear_spread_page(tsk);

        if (is_spread_slab(cs))
                task_set_spread_slab(tsk);
        else
                task_clear_spread_slab(tsk);
}
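
/*
 * The per-task flags set above back the cpuset's memory_spread_page and
 * memory_spread_slab files: when enabled, page cache (and, for the slab
 * flag, certain slab cache) allocations made by the task are spread evenly
 * over the nodes in its mems_allowed rather than preferring the local node.
 */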

/**
 * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags need to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
void cpuset1_update_tasks_flags(struct cpuset *cs)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it)))
                cpuset1_update_task_spread_flags(cs, task);
        css_task_iter_end(&it);
}

/*
 * If the CPU and/or memory hotplug handlers unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
        struct cpuset *parent;

        /*
         * Find its next-highest non-empty parent (the top cpuset
         * has online cpus, so it can't be empty).
         */
        parent = parent_cs(cs);
        while (cpumask_empty(parent->cpus_allowed) ||
                        nodes_empty(parent->mems_allowed))
                parent = parent_cs(parent);

        if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
                pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
                pr_cont_cgroup_name(cs->css.cgroup);
                pr_cont("\n");
        }
}

static void cpuset_migrate_tasks_workfn(struct work_struct *work)
{
        struct cpuset_remove_tasks_struct *s;

        s = container_of(work, struct cpuset_remove_tasks_struct, work);
        remove_tasks_in_empty_cpuset(s->cs);
        css_put(&s->cs->css);
        kfree(s);
}

void cpuset1_hotplug_update_tasks(struct cpuset *cs,
                            struct cpumask *new_cpus, nodemask_t *new_mems,
                            bool cpus_updated, bool mems_updated)
{
        bool is_empty;

        cpuset_callback_lock_irq();
        cpumask_copy(cs->cpus_allowed, new_cpus);
        cpumask_copy(cs->effective_cpus, new_cpus);
        cs->mems_allowed = *new_mems;
        cs->effective_mems = *new_mems;
        cpuset_callback_unlock_irq();

        /*
         * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
         * as the tasks will be migrated to an ancestor.
         */
        if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
                cpuset_update_tasks_cpumask(cs, new_cpus);
        if (mems_updated && !nodes_empty(cs->mems_allowed))
                cpuset_update_tasks_nodemask(cs);

        is_empty = cpumask_empty(cs->cpus_allowed) ||
                   nodes_empty(cs->mems_allowed);

        /*
         * Move tasks to the nearest ancestor with execution resources.
         * This is a full cgroup operation which will also call back into
         * cpuset, so execute it asynchronously using a workqueue.
         */
        if (is_empty && cs->css.cgroup->nr_populated_csets &&
            css_tryget_online(&cs->css)) {
                struct cpuset_remove_tasks_struct *s;

                s = kzalloc(sizeof(*s), GFP_KERNEL);
                if (WARN_ON_ONCE(!s)) {
                        css_put(&cs->css);
                        return;
                }

                s->cs = cs;
                INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
                schedule_work(&s->work);
        }
}
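
/*
 * The css reference taken with css_tryget_online() above keeps the cpuset
 * alive while the work item is pending; cpuset_migrate_tasks_workfn() drops
 * it with css_put() once the tasks have been transferred.
 */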

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
        return  cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
                nodes_subset(p->mems_allowed, q->mems_allowed) &&
                is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
                is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/*
 * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
 *                            behavior.
 */
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
{
        struct cgroup_subsys_state *css;
        struct cpuset *c, *par;
        int ret;

        WARN_ON_ONCE(!rcu_read_lock_held());

        /* Each of our child cpusets must be a subset of us */
        ret = -EBUSY;
        cpuset_for_each_child(c, css, cur)
                if (!is_cpuset_subset(c, trial))
                        goto out;

        /* On legacy hierarchy, we must be a subset of our parent cpuset. */
        ret = -EACCES;
        par = parent_cs(cur);
        if (par && !is_cpuset_subset(trial, par))
                goto out;

        ret = 0;
out:
        return ret;
}

static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;

        switch (type) {
        case FILE_CPU_EXCLUSIVE:
                return is_cpu_exclusive(cs);
        case FILE_MEM_EXCLUSIVE:
                return is_mem_exclusive(cs);
        case FILE_MEM_HARDWALL:
                return is_mem_hardwall(cs);
        case FILE_SCHED_LOAD_BALANCE:
                return is_sched_load_balance(cs);
        case FILE_MEMORY_MIGRATE:
                return is_memory_migrate(cs);
        case FILE_MEMORY_PRESSURE_ENABLED:
                return cpuset_memory_pressure_enabled;
        case FILE_MEMORY_PRESSURE:
                return fmeter_getrate(&cs->fmeter);
        case FILE_SPREAD_PAGE:
                return is_spread_page(cs);
        case FILE_SPREAD_SLAB:
                return is_spread_slab(cs);
        default:
                BUG();
        }

        /* Unreachable but makes gcc happy */
        return 0;
}

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
                            u64 val)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;
        int retval = 0;

        cpus_read_lock();
        cpuset_lock();
        if (!is_cpuset_online(cs)) {
                retval = -ENODEV;
                goto out_unlock;
        }

        switch (type) {
        case FILE_CPU_EXCLUSIVE:
                retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
                break;
        case FILE_MEM_EXCLUSIVE:
                retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
                break;
        case FILE_MEM_HARDWALL:
                retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
                break;
        case FILE_SCHED_LOAD_BALANCE:
                retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
                break;
        case FILE_MEMORY_MIGRATE:
                retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
                break;
        case FILE_MEMORY_PRESSURE_ENABLED:
                cpuset_memory_pressure_enabled = !!val;
                break;
        case FILE_SPREAD_PAGE:
                retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
                break;
        case FILE_SPREAD_SLAB:
                retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
                break;
        default:
                retval = -EINVAL;
                break;
        }
out_unlock:
        cpuset_unlock();
        cpus_read_unlock();
        return retval;
}

/*
 * for the common functions, 'private' gives the type of file
 */

struct cftype cpuset1_files[] = {
        {
                .name = "cpus",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * NR_CPUS),
                .private = FILE_CPULIST,
        },

        {
                .name = "mems",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * MAX_NUMNODES),
                .private = FILE_MEMLIST,
        },

        {
                .name = "effective_cpus",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_CPULIST,
        },

        {
                .name = "effective_mems",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_MEMLIST,
        },

        {
                .name = "cpu_exclusive",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_CPU_EXCLUSIVE,
        },

        {
                .name = "mem_exclusive",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEM_EXCLUSIVE,
        },

        {
                .name = "mem_hardwall",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEM_HARDWALL,
        },

        {
                .name = "sched_load_balance",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SCHED_LOAD_BALANCE,
        },

        {
                .name = "sched_relax_domain_level",
                .read_s64 = cpuset_read_s64,
                .write_s64 = cpuset_write_s64,
                .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
        },

        {
                .name = "memory_migrate",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEMORY_MIGRATE,
        },

        {
                .name = "memory_pressure",
                .read_u64 = cpuset_read_u64,
                .private = FILE_MEMORY_PRESSURE,
        },

        {
                .name = "memory_spread_page",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SPREAD_PAGE,
        },

        {
                /* obsolete, may be removed in the future */
                .name = "memory_spread_slab",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SPREAD_SLAB,
        },

        {
                .name = "memory_pressure_enabled",
                .flags = CFTYPE_ONLY_ON_ROOT,
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEMORY_PRESSURE_ENABLED,
        },

        { }     /* terminate */
};
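
/*
 * On a legacy (v1) hierarchy these files are typically exposed with the
 * controller prefix, e.g. "cpuset.cpus" or "cpuset.memory_pressure", and
 * memory_pressure_enabled only appears in the root cpuset because of
 * CFTYPE_ONLY_ON_ROOT.  A minimal usage sketch, assuming the v1 cpuset
 * hierarchy is mounted at /sys/fs/cgroup/cpuset (the mount point is just
 * an example):
 *
 *   # mkdir /sys/fs/cgroup/cpuset/example
 *   # echo 0-3 > /sys/fs/cgroup/cpuset/example/cpuset.cpus
 *   # echo 0 > /sys/fs/cgroup/cpuset/example/cpuset.mems
 *   # echo 1 > /sys/fs/cgroup/cpuset/cpuset.memory_pressure_enabled
 *   # cat /sys/fs/cgroup/cpuset/example/cpuset.memory_pressure
 */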
