TOMOYO Linux Cross Reference
Linux/kernel/cgroup/cpuset.c

  1 /*
  2  *  kernel/cpuset.c
  3  *
  4  *  Processor and Memory placement constraints for sets of tasks.
  5  *
  6  *  Copyright (C) 2003 BULL SA.
  7  *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
  8  *  Copyright (C) 2006 Google, Inc
  9  *
 10  *  Portions derived from Patrick Mochel's sysfs code.
 11  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 12  *
 13  *  2003-10-10 Written by Simon Derr.
 14  *  2003-10-22 Updates by Stephen Hemminger.
 15  *  2004 May-July Rework by Paul Jackson.
 16  *  2006 Rework by Paul Menage to use generic cgroups
 17  *  2008 Rework of the scheduler domains and CPU hotplug handling
 18  *       by Max Krasnyansky
 19  *
 20  *  This file is subject to the terms and conditions of the GNU General Public
 21  *  License.  See the file COPYING in the main directory of the Linux
 22  *  distribution for more details.
 23  */
 24 #include "cgroup-internal.h"
 25 
 26 #include <linux/cpu.h>
 27 #include <linux/cpumask.h>
 28 #include <linux/cpuset.h>
 29 #include <linux/delay.h>
 30 #include <linux/init.h>
 31 #include <linux/interrupt.h>
 32 #include <linux/kernel.h>
 33 #include <linux/mempolicy.h>
 34 #include <linux/mm.h>
 35 #include <linux/memory.h>
 36 #include <linux/export.h>
 37 #include <linux/rcupdate.h>
 38 #include <linux/sched.h>
 39 #include <linux/sched/deadline.h>
 40 #include <linux/sched/mm.h>
 41 #include <linux/sched/task.h>
 42 #include <linux/security.h>
 43 #include <linux/spinlock.h>
 44 #include <linux/oom.h>
 45 #include <linux/sched/isolation.h>
 46 #include <linux/cgroup.h>
 47 #include <linux/wait.h>
 48 #include <linux/workqueue.h>
 49 
 50 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
 51 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
 52 
 53 /*
  54  * There could be abnormal cpuset configurations for cpu or memory
  55  * node binding; this key provides a quick, low-cost check for
  56  * that situation.
 57  */
 58 DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
 59 
 60 /* See "Frequency meter" comments, below. */
 61 
 62 struct fmeter {
 63         int cnt;                /* unprocessed events count */
 64         int val;                /* most recent output value */
 65         time64_t time;          /* clock (secs) when val computed */
 66         spinlock_t lock;        /* guards read or write of above */
 67 };
 68 
 69 /*
 70  * Invalid partition error code
 71  */
 72 enum prs_errcode {
 73         PERR_NONE = 0,
 74         PERR_INVCPUS,
 75         PERR_INVPARENT,
 76         PERR_NOTPART,
 77         PERR_NOTEXCL,
 78         PERR_NOCPUS,
 79         PERR_HOTPLUG,
 80         PERR_CPUSEMPTY,
 81         PERR_HKEEPING,
 82 };
 83 
 84 static const char * const perr_strings[] = {
 85         [PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus.exclusive",
 86         [PERR_INVPARENT] = "Parent is an invalid partition root",
 87         [PERR_NOTPART]   = "Parent is not a partition root",
 88         [PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
 89         [PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
 90         [PERR_HOTPLUG]   = "No cpu available due to hotplug",
 91         [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
 92         [PERR_HKEEPING]  = "partition config conflicts with housekeeping setup",
 93 };
 94 
 95 struct cpuset {
 96         struct cgroup_subsys_state css;
 97 
 98         unsigned long flags;            /* "unsigned long" so bitops work */
 99 
100         /*
101          * On default hierarchy:
102          *
103          * The user-configured masks can only be changed by writing to
104          * cpuset.cpus and cpuset.mems, and won't be limited by the
105          * parent masks.
106          *
 107          * The effective masks are the real masks that apply to the tasks
108          * in the cpuset. They may be changed if the configured masks are
109          * changed or hotplug happens.
110          *
111          * effective_mask == configured_mask & parent's effective_mask,
 112          * and if it ends up empty, it will inherit the parent's mask (see the
 113          * illustrative sketch after this structure definition).
114          *
115          * On legacy hierarchy:
116          *
 117          * The user-configured masks are always the same as the effective masks.
118          */
119 
 120         /* user-configured CPUs and Memory Nodes allowed to tasks */
121         cpumask_var_t cpus_allowed;
122         nodemask_t mems_allowed;
123 
 124         /* effective CPUs and Memory Nodes allowed to tasks */
125         cpumask_var_t effective_cpus;
126         nodemask_t effective_mems;
127 
128         /*
129          * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
130          *
 131          * The effective_cpus of a valid partition root comes solely from its
 132          * effective_xcpus; some of the effective_xcpus may be distributed to
 133          * sub-partitions below and hence excluded from its effective_cpus.
 134          * For a valid partition root, effective_cpus has no relationship with
 135          * cpus_allowed unless exclusive_cpus is left unset.
136          *
137          * This value will only be set if either exclusive_cpus is set or
138          * when this cpuset becomes a local partition root.
139          */
140         cpumask_var_t effective_xcpus;
141 
142         /*
143          * Exclusive CPUs as requested by the user (default hierarchy only)
144          *
145          * Its value is independent of cpus_allowed and designates the set of
146          * CPUs that can be granted to the current cpuset or its children when
147          * it becomes a valid partition root. The effective set of exclusive
148          * CPUs granted (effective_xcpus) depends on whether those exclusive
149          * CPUs are passed down by its ancestors and not yet taken up by
150          * another sibling partition root along the way.
151          *
152          * If its value isn't set, it defaults to cpus_allowed.
153          */
154         cpumask_var_t exclusive_cpus;
155 
156         /*
 157          * These are the old Memory Nodes that tasks took on.
158          *
159          * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
160          * - A new cpuset's old_mems_allowed is initialized when some
161          *   task is moved into it.
162          * - old_mems_allowed is used in cpuset_migrate_mm() when we change
163          *   cpuset.mems_allowed and have tasks' nodemask updated, and
164          *   then old_mems_allowed is updated to mems_allowed.
165          */
166         nodemask_t old_mems_allowed;
167 
168         struct fmeter fmeter;           /* memory_pressure filter */
169 
170         /*
171          * Tasks are being attached to this cpuset.  Used to prevent
172          * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
173          */
174         int attach_in_progress;
175 
176         /* partition number for rebuild_sched_domains() */
177         int pn;
178 
179         /* for custom sched domain */
180         int relax_domain_level;
181 
182         /* number of valid local child partitions */
183         int nr_subparts;
184 
185         /* partition root state */
186         int partition_root_state;
187 
188         /*
189          * Default hierarchy only:
190          * use_parent_ecpus - set if using parent's effective_cpus
191          * child_ecpus_count - # of children with use_parent_ecpus set
192          */
193         int use_parent_ecpus;
194         int child_ecpus_count;
195 
196         /*
197          * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
198          * know when to rebuild associated root domain bandwidth information.
199          */
200         int nr_deadline_tasks;
201         int nr_migrate_dl_tasks;
202         u64 sum_migrate_dl_bw;
203 
204         /* Invalid partition error code, not lock protected */
205         enum prs_errcode prs_err;
206 
207         /* Handle for cpuset.cpus.partition */
208         struct cgroup_file partition_file;
209 
 210         /* Remote partition sibling list anchored at remote_children */
211         struct list_head remote_sibling;
212 };
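
/*
 * Editor's illustrative sketch (not part of cpuset.c): the default-hierarchy
 * rule documented in struct cpuset above -- effective_cpus is the
 * intersection of the user-configured mask and the parent's effective mask,
 * falling back to the parent's mask when that intersection is empty --
 * expressed with the same cpumask helpers this file uses. The function name
 * is hypothetical.
 */
static void example_compute_effective(struct cpuset *cs, struct cpuset *parent)
{
        /* cpumask_and() returns false when the resulting mask is empty */
        if (!cpumask_and(cs->effective_cpus, cs->cpus_allowed,
                         parent->effective_cpus))
                cpumask_copy(cs->effective_cpus, parent->effective_cpus);
}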
213 
214 /*
 215  * The legacy-hierarchy call to cgroup_transfer_tasks() is handled asynchronously
216  */
217 struct cpuset_remove_tasks_struct {
218         struct work_struct work;
219         struct cpuset *cs;
220 };
221 
222 /*
223  * Exclusive CPUs distributed out to sub-partitions of top_cpuset
224  */
225 static cpumask_var_t    subpartitions_cpus;
226 
227 /*
228  * Exclusive CPUs in isolated partitions
229  */
230 static cpumask_var_t    isolated_cpus;
231 
232 /* List of remote partition root children */
233 static struct list_head remote_children;
234 
235 /*
236  * Partition root states:
237  *
238  *   0 - member (not a partition root)
239  *   1 - partition root
240  *   2 - partition root without load balancing (isolated)
241  *  -1 - invalid partition root
242  *  -2 - invalid isolated partition root
243  *
 244  *  There are two types of partitions - local and remote. Local partitions are
 245  *  those whose parents are partition roots themselves. Setting
 246  *  cpuset.cpus.exclusive is optional when setting up local partitions.
 247  *  Remote partitions are those whose parents are not partition roots. Passing
 248  *  down exclusive CPUs by setting cpuset.cpus.exclusive along the ancestor
 249  *  nodes is mandatory when creating a remote partition.
250  *
251  *  For simplicity, a local partition can be created under a local or remote
252  *  partition but a remote partition cannot have any partition root in its
253  *  ancestor chain except the cgroup root.
254  */
255 #define PRS_MEMBER              0
256 #define PRS_ROOT                1
257 #define PRS_ISOLATED            2
258 #define PRS_INVALID_ROOT        -1
259 #define PRS_INVALID_ISOLATED    -2
260 
261 static inline bool is_prs_invalid(int prs_state)
262 {
263         return prs_state < 0;
264 }
265 
266 /*
267  * Temporary cpumasks for working with partitions that are passed among
268  * functions to avoid memory allocation in inner functions.
269  */
270 struct tmpmasks {
271         cpumask_var_t addmask, delmask; /* For partition root */
272         cpumask_var_t new_cpus;         /* For update_cpumasks_hier() */
273 };
274 
275 static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
276 {
277         return css ? container_of(css, struct cpuset, css) : NULL;
278 }
279 
280 /* Retrieve the cpuset for a task */
281 static inline struct cpuset *task_cs(struct task_struct *task)
282 {
283         return css_cs(task_css(task, cpuset_cgrp_id));
284 }
285 
286 static inline struct cpuset *parent_cs(struct cpuset *cs)
287 {
288         return css_cs(cs->css.parent);
289 }
290 
291 void inc_dl_tasks_cs(struct task_struct *p)
292 {
293         struct cpuset *cs = task_cs(p);
294 
295         cs->nr_deadline_tasks++;
296 }
297 
298 void dec_dl_tasks_cs(struct task_struct *p)
299 {
300         struct cpuset *cs = task_cs(p);
301 
302         cs->nr_deadline_tasks--;
303 }
304 
305 /* bits in struct cpuset flags field */
306 typedef enum {
307         CS_ONLINE,
308         CS_CPU_EXCLUSIVE,
309         CS_MEM_EXCLUSIVE,
310         CS_MEM_HARDWALL,
311         CS_MEMORY_MIGRATE,
312         CS_SCHED_LOAD_BALANCE,
313         CS_SPREAD_PAGE,
314         CS_SPREAD_SLAB,
315 } cpuset_flagbits_t;
316 
317 /* convenient tests for these bits */
318 static inline bool is_cpuset_online(struct cpuset *cs)
319 {
320         return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
321 }
322 
323 static inline int is_cpu_exclusive(const struct cpuset *cs)
324 {
325         return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
326 }
327 
328 static inline int is_mem_exclusive(const struct cpuset *cs)
329 {
330         return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
331 }
332 
333 static inline int is_mem_hardwall(const struct cpuset *cs)
334 {
335         return test_bit(CS_MEM_HARDWALL, &cs->flags);
336 }
337 
338 static inline int is_sched_load_balance(const struct cpuset *cs)
339 {
340         return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
341 }
342 
343 static inline int is_memory_migrate(const struct cpuset *cs)
344 {
345         return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
346 }
347 
348 static inline int is_spread_page(const struct cpuset *cs)
349 {
350         return test_bit(CS_SPREAD_PAGE, &cs->flags);
351 }
352 
353 static inline int is_spread_slab(const struct cpuset *cs)
354 {
355         return test_bit(CS_SPREAD_SLAB, &cs->flags);
356 }
357 
358 static inline int is_partition_valid(const struct cpuset *cs)
359 {
360         return cs->partition_root_state > 0;
361 }
362 
363 static inline int is_partition_invalid(const struct cpuset *cs)
364 {
365         return cs->partition_root_state < 0;
366 }
367 
368 /*
369  * Callers should hold callback_lock to modify partition_root_state.
370  */
371 static inline void make_partition_invalid(struct cpuset *cs)
372 {
373         if (cs->partition_root_state > 0)
374                 cs->partition_root_state = -cs->partition_root_state;
375 }
376 
377 /*
 378  * Send a notification event whenever partition_root_state changes.
379  */
380 static inline void notify_partition_change(struct cpuset *cs, int old_prs)
381 {
382         if (old_prs == cs->partition_root_state)
383                 return;
384         cgroup_file_notify(&cs->partition_file);
385 
386         /* Reset prs_err if not invalid */
387         if (is_partition_valid(cs))
388                 WRITE_ONCE(cs->prs_err, PERR_NONE);
389 }
390 
391 static struct cpuset top_cpuset = {
392         .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) |
393                  BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
394         .partition_root_state = PRS_ROOT,
395         .relax_domain_level = -1,
396         .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
397 };
398 
399 /**
400  * cpuset_for_each_child - traverse online children of a cpuset
401  * @child_cs: loop cursor pointing to the current child
402  * @pos_css: used for iteration
403  * @parent_cs: target cpuset to walk children of
404  *
405  * Walk @child_cs through the online children of @parent_cs.  Must be used
406  * with RCU read locked.
407  */
408 #define cpuset_for_each_child(child_cs, pos_css, parent_cs)             \
409         css_for_each_child((pos_css), &(parent_cs)->css)                \
410                 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
411 
412 /**
413  * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
414  * @des_cs: loop cursor pointing to the current descendant
415  * @pos_css: used for iteration
 416  * @root_cs: target cpuset to walk descendants of
417  *
418  * Walk @des_cs through the online descendants of @root_cs.  Must be used
419  * with RCU read locked.  The caller may modify @pos_css by calling
420  * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
421  * iteration and the first node to be visited.
422  */
423 #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)        \
424         css_for_each_descendant_pre((pos_css), &(root_cs)->css)         \
425                 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
426 
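/*
 * Editor's illustrative sketch (not part of cpuset.c): minimal use of the
 * iteration macros above. The RCU read lock must be held across the walk,
 * exactly as partition_is_populated() further down does it. The function
 * name is hypothetical.
 */
static int example_count_online_children(struct cpuset *parent)
{
        struct cgroup_subsys_state *pos_css;
        struct cpuset *child;
        int n = 0;

        rcu_read_lock();
        cpuset_for_each_child(child, pos_css, parent)
                n++;            /* only online children are visited */
        rcu_read_unlock();
        return n;
}
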
427 /*
428  * There are two global locks guarding cpuset structures - cpuset_mutex and
429  * callback_lock. We also require taking task_lock() when dereferencing a
 430  * task's cpuset pointer.
 431  * The cpuset code uses only cpuset_mutex. Other kernel subsystems
432  * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
433  * structures. Note that cpuset_mutex needs to be a mutex as it is used in
434  * paths that rely on priority inheritance (e.g. scheduler - on RT) for
435  * correctness.
436  *
437  * A task must hold both locks to modify cpusets.  If a task holds
438  * cpuset_mutex, it blocks others, ensuring that it is the only task able to
439  * also acquire callback_lock and be able to modify cpusets.  It can perform
440  * various checks on the cpuset structure first, knowing nothing will change.
441  * It can also allocate memory while just holding cpuset_mutex.  While it is
442  * performing these checks, various callback routines can briefly acquire
443  * callback_lock to query cpusets.  Once it is ready to make the changes, it
444  * takes callback_lock, blocking everyone else.
445  *
446  * Calls to the kernel memory allocator can not be made while holding
447  * callback_lock, as that would risk double tripping on callback_lock
448  * from one of the callbacks into the cpuset code from within
449  * __alloc_pages().
450  *
451  * If a task is only holding callback_lock, then it has read-only
452  * access to cpusets.
453  *
 454  * The task_struct fields mems_allowed and mempolicy may be changed
 455  * by another task, so we use alloc_lock in the task_struct to protect
456  * them.
457  *
458  * The cpuset_common_seq_show() handlers only hold callback_lock across
459  * small pieces of code, such as when reading out possibly multi-word
460  * cpumasks and nodemasks.
461  *
462  * Accessing a task's cpuset should be done in accordance with the
463  * guidelines for accessing subsystem state in kernel/cgroup.c
464  */
465 
466 static DEFINE_MUTEX(cpuset_mutex);
467 
468 void cpuset_lock(void)
469 {
470         mutex_lock(&cpuset_mutex);
471 }
472 
473 void cpuset_unlock(void)
474 {
475         mutex_unlock(&cpuset_mutex);
476 }
477 
478 static DEFINE_SPINLOCK(callback_lock);
479 
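/*
 * Editor's illustrative sketch (not part of cpuset.c): the two-lock pattern
 * described in the comment above. A writer serializes on cpuset_mutex for
 * the whole update (it may sleep and allocate there) and takes callback_lock
 * only around the actual stores; a read-side callback takes callback_lock
 * alone for a short, allocation-free read. Both function names are
 * hypothetical.
 */
static void example_writer(struct cpuset *cs, const struct cpumask *new_cpus)
{
        mutex_lock(&cpuset_mutex);              /* the only lock needed for checks/allocations */
        spin_lock_irq(&callback_lock);          /* briefly block readers for the store */
        cpumask_copy(cs->cpus_allowed, new_cpus);
        spin_unlock_irq(&callback_lock);
        mutex_unlock(&cpuset_mutex);
}

static bool example_reader(struct cpuset *cs, int cpu)
{
        bool allowed;

        spin_lock_irq(&callback_lock);          /* read-only access, no memory allocation here */
        allowed = cpumask_test_cpu(cpu, cs->effective_cpus);
        spin_unlock_irq(&callback_lock);
        return allowed;
}
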
480 static struct workqueue_struct *cpuset_migrate_mm_wq;
481 
482 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
483 
484 static inline void check_insane_mems_config(nodemask_t *nodes)
485 {
486         if (!cpusets_insane_config() &&
487                 movable_only_nodes(nodes)) {
488                 static_branch_enable(&cpusets_insane_config_key);
489                 pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
490                         "Cpuset allocations might fail even with a lot of memory available.\n",
491                         nodemask_pr_args(nodes));
492         }
493 }
494 
495 /*
496  * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
497  * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
498  * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
499  * With v2 behavior, "cpus" and "mems" are always what the users have
500  * requested and won't be changed by hotplug events. Only the effective
501  * cpus or mems will be affected.
502  */
503 static inline bool is_in_v2_mode(void)
504 {
505         return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
506               (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
507 }
508 
509 /**
510  * partition_is_populated - check if partition has tasks
511  * @cs: partition root to be checked
512  * @excluded_child: a child cpuset to be excluded in task checking
513  * Return: true if there are tasks, false otherwise
514  *
515  * It is assumed that @cs is a valid partition root. @excluded_child should
516  * be non-NULL when this cpuset is going to become a partition itself.
517  */
518 static inline bool partition_is_populated(struct cpuset *cs,
519                                           struct cpuset *excluded_child)
520 {
521         struct cgroup_subsys_state *css;
522         struct cpuset *child;
523 
524         if (cs->css.cgroup->nr_populated_csets)
525                 return true;
526         if (!excluded_child && !cs->nr_subparts)
527                 return cgroup_is_populated(cs->css.cgroup);
528 
529         rcu_read_lock();
530         cpuset_for_each_child(child, css, cs) {
531                 if (child == excluded_child)
532                         continue;
533                 if (is_partition_valid(child))
534                         continue;
535                 if (cgroup_is_populated(child->css.cgroup)) {
536                         rcu_read_unlock();
537                         return true;
538                 }
539         }
540         rcu_read_unlock();
541         return false;
542 }
543 
544 /*
 545  * Return in pmask the portion of a task's cpuset's cpus_allowed that
546  * are online and are capable of running the task.  If none are found,
547  * walk up the cpuset hierarchy until we find one that does have some
548  * appropriate cpus.
549  *
550  * One way or another, we guarantee to return some non-empty subset
551  * of cpu_online_mask.
552  *
553  * Call with callback_lock or cpuset_mutex held.
554  */
555 static void guarantee_online_cpus(struct task_struct *tsk,
556                                   struct cpumask *pmask)
557 {
558         const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
559         struct cpuset *cs;
560 
561         if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
562                 cpumask_copy(pmask, cpu_online_mask);
563 
564         rcu_read_lock();
565         cs = task_cs(tsk);
566 
567         while (!cpumask_intersects(cs->effective_cpus, pmask))
568                 cs = parent_cs(cs);
569 
570         cpumask_and(pmask, pmask, cs->effective_cpus);
571         rcu_read_unlock();
572 }
573 
574 /*
 575  * Return in *pmask the portion of a cpuset's mems_allowed that
576  * are online, with memory.  If none are online with memory, walk
577  * up the cpuset hierarchy until we find one that does have some
578  * online mems.  The top cpuset always has some mems online.
579  *
580  * One way or another, we guarantee to return some non-empty subset
581  * of node_states[N_MEMORY].
582  *
583  * Call with callback_lock or cpuset_mutex held.
584  */
585 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
586 {
587         while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
588                 cs = parent_cs(cs);
589         nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
590 }
591 
592 /*
593  * update task's spread flag if cpuset's page/slab spread flag is set
594  *
595  * Call with callback_lock or cpuset_mutex held. The check can be skipped
596  * if on default hierarchy.
597  */
598 static void cpuset_update_task_spread_flags(struct cpuset *cs,
599                                         struct task_struct *tsk)
600 {
601         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
602                 return;
603 
604         if (is_spread_page(cs))
605                 task_set_spread_page(tsk);
606         else
607                 task_clear_spread_page(tsk);
608 
609         if (is_spread_slab(cs))
610                 task_set_spread_slab(tsk);
611         else
612                 task_clear_spread_slab(tsk);
613 }
614 
615 /*
616  * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
617  *
618  * One cpuset is a subset of another if all its allowed CPUs and
619  * Memory Nodes are a subset of the other, and its exclusive flags
620  * are only set if the other's are set.  Call holding cpuset_mutex.
621  */
622 
623 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
624 {
625         return  cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
626                 nodes_subset(p->mems_allowed, q->mems_allowed) &&
627                 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
628                 is_mem_exclusive(p) <= is_mem_exclusive(q);
629 }
630 
631 /**
 632  * alloc_cpumasks - allocate the cpumasks for a cpuset or a tmpmasks structure
 633  * @cs:  the cpuset whose cpumasks are to be allocated.
634  * @tmp: the tmpmasks structure pointer
635  * Return: 0 if successful, -ENOMEM otherwise.
636  *
637  * Only one of the two input arguments should be non-NULL.
638  */
639 static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
640 {
641         cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;
642 
643         if (cs) {
644                 pmask1 = &cs->cpus_allowed;
645                 pmask2 = &cs->effective_cpus;
646                 pmask3 = &cs->effective_xcpus;
647                 pmask4 = &cs->exclusive_cpus;
648         } else {
649                 pmask1 = &tmp->new_cpus;
650                 pmask2 = &tmp->addmask;
651                 pmask3 = &tmp->delmask;
652                 pmask4 = NULL;
653         }
654 
655         if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
656                 return -ENOMEM;
657 
658         if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
659                 goto free_one;
660 
661         if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
662                 goto free_two;
663 
664         if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
665                 goto free_three;
666 
667 
668         return 0;
669 
670 free_three:
671         free_cpumask_var(*pmask3);
672 free_two:
673         free_cpumask_var(*pmask2);
674 free_one:
675         free_cpumask_var(*pmask1);
676         return -ENOMEM;
677 }
678 
679 /**
 680  * free_cpumasks - free cpumasks in a cpuset and/or a tmpmasks structure
 681  * @cs:  the cpuset whose cpumasks are to be freed.
682  * @tmp: the tmpmasks structure pointer
683  */
684 static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
685 {
686         if (cs) {
687                 free_cpumask_var(cs->cpus_allowed);
688                 free_cpumask_var(cs->effective_cpus);
689                 free_cpumask_var(cs->effective_xcpus);
690                 free_cpumask_var(cs->exclusive_cpus);
691         }
692         if (tmp) {
693                 free_cpumask_var(tmp->new_cpus);
694                 free_cpumask_var(tmp->addmask);
695                 free_cpumask_var(tmp->delmask);
696         }
697 }
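
/*
 * Editor's illustrative sketch (not part of cpuset.c): the usual life cycle
 * of a tmpmasks structure with the two helpers above, matching how the
 * cpumask update paths later in this file use them. The function name is
 * hypothetical.
 */
static int example_with_tmpmasks(void)
{
        struct tmpmasks tmp;

        if (alloc_cpumasks(NULL, &tmp))         /* NULL cs: allocate new_cpus, addmask, delmask */
                return -ENOMEM;

        /* ... compute CPU additions/removals into tmp.addmask / tmp.delmask ... */

        free_cpumasks(NULL, &tmp);              /* NULL cs: free only the tmpmasks masks */
        return 0;
}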
698 
699 /**
700  * alloc_trial_cpuset - allocate a trial cpuset
701  * @cs: the cpuset that the trial cpuset duplicates
702  */
703 static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
704 {
705         struct cpuset *trial;
706 
707         trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
708         if (!trial)
709                 return NULL;
710 
711         if (alloc_cpumasks(trial, NULL)) {
712                 kfree(trial);
713                 return NULL;
714         }
715 
716         cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
717         cpumask_copy(trial->effective_cpus, cs->effective_cpus);
718         cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
719         cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
720         return trial;
721 }
722 
723 /**
724  * free_cpuset - free the cpuset
725  * @cs: the cpuset to be freed
726  */
727 static inline void free_cpuset(struct cpuset *cs)
728 {
729         free_cpumasks(cs, NULL);
730         kfree(cs);
731 }
732 
733 /* Return user specified exclusive CPUs */
734 static inline struct cpumask *user_xcpus(struct cpuset *cs)
735 {
736         return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
737                                                  : cs->exclusive_cpus;
738 }
739 
740 static inline bool xcpus_empty(struct cpuset *cs)
741 {
742         return cpumask_empty(cs->cpus_allowed) &&
743                cpumask_empty(cs->exclusive_cpus);
744 }
745 
746 static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
747 {
748         return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
749                cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed
750                                                   : cs->effective_xcpus;
751 }
752 
753 /*
754  * cpusets_are_exclusive() - check if two cpusets are exclusive
755  *
756  * Return true if exclusive, false if not
757  */
758 static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
759 {
760         struct cpumask *xcpus1 = fetch_xcpus(cs1);
761         struct cpumask *xcpus2 = fetch_xcpus(cs2);
762 
763         if (cpumask_intersects(xcpus1, xcpus2))
764                 return false;
765         return true;
766 }
767 
768 /*
769  * validate_change_legacy() - Validate conditions specific to legacy (v1)
770  *                            behavior.
771  */
772 static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
773 {
774         struct cgroup_subsys_state *css;
775         struct cpuset *c, *par;
776         int ret;
777 
778         WARN_ON_ONCE(!rcu_read_lock_held());
779 
780         /* Each of our child cpusets must be a subset of us */
781         ret = -EBUSY;
782         cpuset_for_each_child(c, css, cur)
783                 if (!is_cpuset_subset(c, trial))
784                         goto out;
785 
786         /* On legacy hierarchy, we must be a subset of our parent cpuset. */
787         ret = -EACCES;
788         par = parent_cs(cur);
789         if (par && !is_cpuset_subset(trial, par))
790                 goto out;
791 
792         ret = 0;
793 out:
794         return ret;
795 }
796 
797 /*
798  * validate_change() - Used to validate that any proposed cpuset change
799  *                     follows the structural rules for cpusets.
800  *
801  * If we replaced the flag and mask values of the current cpuset
802  * (cur) with those values in the trial cpuset (trial), would
803  * our various subset and exclusive rules still be valid?  Presumes
804  * cpuset_mutex held.
805  *
806  * 'cur' is the address of an actual, in-use cpuset.  Operations
807  * such as list traversal that depend on the actual address of the
808  * cpuset in the list must use cur below, not trial.
809  *
 810  * 'trial' is the address of a bulk structure copy of cur, with
811  * perhaps one or more of the fields cpus_allowed, mems_allowed,
812  * or flags changed to new, trial values.
813  *
814  * Return 0 if valid, -errno if not.
815  */
816 
817 static int validate_change(struct cpuset *cur, struct cpuset *trial)
818 {
819         struct cgroup_subsys_state *css;
820         struct cpuset *c, *par;
821         int ret = 0;
822 
823         rcu_read_lock();
824 
825         if (!is_in_v2_mode())
826                 ret = validate_change_legacy(cur, trial);
827         if (ret)
828                 goto out;
829 
830         /* Remaining checks don't apply to root cpuset */
831         if (cur == &top_cpuset)
832                 goto out;
833 
834         par = parent_cs(cur);
835 
836         /*
837          * Cpusets with tasks - existing or newly being attached - can't
838          * be changed to have empty cpus_allowed or mems_allowed.
839          */
840         ret = -ENOSPC;
841         if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
842                 if (!cpumask_empty(cur->cpus_allowed) &&
843                     cpumask_empty(trial->cpus_allowed))
844                         goto out;
845                 if (!nodes_empty(cur->mems_allowed) &&
846                     nodes_empty(trial->mems_allowed))
847                         goto out;
848         }
849 
850         /*
851          * We can't shrink if we won't have enough room for SCHED_DEADLINE
852          * tasks.
853          */
854         ret = -EBUSY;
855         if (is_cpu_exclusive(cur) &&
856             !cpuset_cpumask_can_shrink(cur->cpus_allowed,
857                                        trial->cpus_allowed))
858                 goto out;
859 
860         /*
861          * If either I or some sibling (!= me) is exclusive, we can't
 862          * overlap. Sibling exclusive_cpus masks cannot overlap if set.
863          */
864         ret = -EINVAL;
865         cpuset_for_each_child(c, css, par) {
866                 bool txset, cxset;      /* Are exclusive_cpus set? */
867 
868                 if (c == cur)
869                         continue;
870 
871                 txset = !cpumask_empty(trial->exclusive_cpus);
872                 cxset = !cpumask_empty(c->exclusive_cpus);
873                 if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
874                     (txset && cxset)) {
875                         if (!cpusets_are_exclusive(trial, c))
876                                 goto out;
877                 } else if (txset || cxset) {
878                         struct cpumask *xcpus, *acpus;
879 
880                         /*
 881                          * When only one of the two exclusive_cpus masks is set,
 882                          * the cpus_allowed of the other cpuset, if set, must not
 883                          * be a subset of it; otherwise none of those CPUs would
 884                          * remain available once the exclusive CPUs are activated.
885                          */
886                         if (txset) {
887                                 xcpus = trial->exclusive_cpus;
888                                 acpus = c->cpus_allowed;
889                         } else {
890                                 xcpus = c->exclusive_cpus;
891                                 acpus = trial->cpus_allowed;
892                         }
893                         if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
894                                 goto out;
895                 }
896                 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
897                     nodes_intersects(trial->mems_allowed, c->mems_allowed))
898                         goto out;
899         }
900 
901         ret = 0;
902 out:
903         rcu_read_unlock();
904         return ret;
905 }
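
/*
 * Editor's illustrative sketch (not part of cpuset.c): how the cpus/mems
 * write handlers later in this file pair alloc_trial_cpuset() with
 * validate_change(). The proposed value is staged on a throwaway copy and
 * checked against the live hierarchy; the real handlers then apply the
 * change under callback_lock. The function name is hypothetical and
 * cpuset_mutex is assumed to be held.
 */
static int example_validate_new_cpus(struct cpuset *cs,
                                     const struct cpumask *requested)
{
        struct cpuset *trial = alloc_trial_cpuset(cs);
        int err;

        if (!trial)
                return -ENOMEM;

        cpumask_copy(trial->cpus_allowed, requested);   /* stage the proposed mask */
        err = validate_change(cs, trial);               /* subset/exclusivity rules */

        free_cpuset(trial);
        return err;
}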
906 
907 #ifdef CONFIG_SMP
908 /*
909  * Helper routine for generate_sched_domains().
 910  * Do cpusets a, b have overlapping effective_cpus masks?
911  */
912 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
913 {
914         return cpumask_intersects(a->effective_cpus, b->effective_cpus);
915 }
916 
917 static void
918 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
919 {
920         if (dattr->relax_domain_level < c->relax_domain_level)
921                 dattr->relax_domain_level = c->relax_domain_level;
922         return;
923 }
924 
925 static void update_domain_attr_tree(struct sched_domain_attr *dattr,
926                                     struct cpuset *root_cs)
927 {
928         struct cpuset *cp;
929         struct cgroup_subsys_state *pos_css;
930 
931         rcu_read_lock();
932         cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
933                 /* skip the whole subtree if @cp doesn't have any CPU */
934                 if (cpumask_empty(cp->cpus_allowed)) {
935                         pos_css = css_rightmost_descendant(pos_css);
936                         continue;
937                 }
938 
939                 if (is_sched_load_balance(cp))
940                         update_domain_attr(dattr, cp);
941         }
942         rcu_read_unlock();
943 }
944 
945 /* Must be called with cpuset_mutex held.  */
946 static inline int nr_cpusets(void)
947 {
948         /* jump label reference count + the top-level cpuset */
949         return static_key_count(&cpusets_enabled_key.key) + 1;
950 }
951 
952 /*
953  * generate_sched_domains()
954  *
 955  * This function builds a partial partition of the system's CPUs.
956  * A 'partial partition' is a set of non-overlapping subsets whose
957  * union is a subset of that set.
958  * The output of this function needs to be passed to kernel/sched/core.c
959  * partition_sched_domains() routine, which will rebuild the scheduler's
960  * load balancing domains (sched domains) as specified by that partial
961  * partition.
962  *
963  * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
964  * for a background explanation of this.
965  *
966  * Does not return errors, on the theory that the callers of this
967  * routine would rather not worry about failures to rebuild sched
968  * domains when operating in the severe memory shortage situations
969  * that could cause allocation failures below.
970  *
971  * Must be called with cpuset_mutex held.
972  *
973  * The three key local variables below are:
974  *    cp - cpuset pointer, used (together with pos_css) to perform a
975  *         top-down scan of all cpusets. For our purposes, rebuilding
 976  *         the scheduler's sched domains, we can ignore !is_sched_load_
977  *         balance cpusets.
978  *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
979  *         that need to be load balanced, for convenient iterative
980  *         access by the subsequent code that finds the best partition,
 981  *         i.e. the set of domains (subsets) of CPUs such that the
982  *         cpus_allowed of every cpuset marked is_sched_load_balance
983  *         is a subset of one of these domains, while there are as
984  *         many such domains as possible, each as small as possible.
985  * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
986  *         the kernel/sched/core.c routine partition_sched_domains() in a
987  *         convenient format, that can be easily compared to the prior
988  *         value to determine what partition elements (sched domains)
989  *         were changed (added or removed.)
990  *
991  * Finding the best partition (set of domains):
992  *      The triple nested loops below over i, j, k scan over the
993  *      load balanced cpusets (using the array of cpuset pointers in
994  *      csa[]) looking for pairs of cpusets that have overlapping
 995  *      cpus_allowed but don't yet have the same 'pn' partition
 996  *      number, and merges them into the same partition number.  It keeps
 997  *      looping on the 'restart' label until it can no longer find
 998  *      any such pairs (see the standalone sketch after this function).
999  *
1000  *      The union of the cpus_allowed masks from the set of
 1001  *      all cpusets having the same 'pn' value then forms one
1002  *      element of the partition (one sched domain) to be passed to
1003  *      partition_sched_domains().
1004  */
1005 static int generate_sched_domains(cpumask_var_t **domains,
1006                         struct sched_domain_attr **attributes)
1007 {
1008         struct cpuset *cp;      /* top-down scan of cpusets */
1009         struct cpuset **csa;    /* array of all cpuset ptrs */
1010         int csn;                /* how many cpuset ptrs in csa so far */
1011         int i, j, k;            /* indices for partition finding loops */
1012         cpumask_var_t *doms;    /* resulting partition; i.e. sched domains */
1013         struct sched_domain_attr *dattr;  /* attributes for custom domains */
1014         int ndoms = 0;          /* number of sched domains in result */
1015         int nslot;              /* next empty doms[] struct cpumask slot */
1016         struct cgroup_subsys_state *pos_css;
1017         bool root_load_balance = is_sched_load_balance(&top_cpuset);
1018         bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
1019 
1020         doms = NULL;
1021         dattr = NULL;
1022         csa = NULL;
1023 
1024         /* Special case for the 99% of systems with one, full, sched domain */
1025         if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
1026 single_root_domain:
1027                 ndoms = 1;
1028                 doms = alloc_sched_domains(ndoms);
1029                 if (!doms)
1030                         goto done;
1031 
1032                 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
1033                 if (dattr) {
1034                         *dattr = SD_ATTR_INIT;
1035                         update_domain_attr_tree(dattr, &top_cpuset);
1036                 }
1037                 cpumask_and(doms[0], top_cpuset.effective_cpus,
1038                             housekeeping_cpumask(HK_TYPE_DOMAIN));
1039 
1040                 goto done;
1041         }
1042 
1043         csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
1044         if (!csa)
1045                 goto done;
1046         csn = 0;
1047 
1048         rcu_read_lock();
1049         if (root_load_balance)
1050                 csa[csn++] = &top_cpuset;
1051         cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
1052                 if (cp == &top_cpuset)
1053                         continue;
1054 
1055                 if (cgrpv2)
1056                         goto v2;
1057 
1058                 /*
1059                  * v1:
1060                  * Continue traversing beyond @cp iff @cp has some CPUs and
1061                  * isn't load balancing.  The former is obvious.  The
1062                  * latter: All child cpusets contain a subset of the
1063                  * parent's cpus, so just skip them, and then we call
1064                  * update_domain_attr_tree() to calc relax_domain_level of
1065                  * the corresponding sched domain.
1066                  */
1067                 if (!cpumask_empty(cp->cpus_allowed) &&
1068                     !(is_sched_load_balance(cp) &&
1069                       cpumask_intersects(cp->cpus_allowed,
1070                                          housekeeping_cpumask(HK_TYPE_DOMAIN))))
1071                         continue;
1072 
1073                 if (is_sched_load_balance(cp) &&
1074                     !cpumask_empty(cp->effective_cpus))
1075                         csa[csn++] = cp;
1076 
1077                 /* skip @cp's subtree */
1078                 pos_css = css_rightmost_descendant(pos_css);
1079                 continue;
1080 
1081 v2:
1082                 /*
 1083                  * Only valid partition roots that are not isolated and have
 1084                  * non-empty effective_cpus will be saved into csa[].
1085                  */
1086                 if ((cp->partition_root_state == PRS_ROOT) &&
1087                     !cpumask_empty(cp->effective_cpus))
1088                         csa[csn++] = cp;
1089 
1090                 /*
 1091                  * Skip @cp's subtree if it is not a partition root and has no
1092                  * exclusive CPUs to be granted to child cpusets.
1093                  */
1094                 if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
1095                         pos_css = css_rightmost_descendant(pos_css);
1096         }
1097         rcu_read_unlock();
1098 
1099         /*
1100          * If there are only isolated partitions underneath the cgroup root,
1101          * we can optimize out unneeded sched domains scanning.
1102          */
1103         if (root_load_balance && (csn == 1))
1104                 goto single_root_domain;
1105 
1106         for (i = 0; i < csn; i++)
1107                 csa[i]->pn = i;
1108         ndoms = csn;
1109 
1110 restart:
1111         /* Find the best partition (set of sched domains) */
1112         for (i = 0; i < csn; i++) {
1113                 struct cpuset *a = csa[i];
1114                 int apn = a->pn;
1115 
1116                 for (j = 0; j < csn; j++) {
1117                         struct cpuset *b = csa[j];
1118                         int bpn = b->pn;
1119 
1120                         if (apn != bpn && cpusets_overlap(a, b)) {
1121                                 for (k = 0; k < csn; k++) {
1122                                         struct cpuset *c = csa[k];
1123 
1124                                         if (c->pn == bpn)
1125                                                 c->pn = apn;
1126                                 }
1127                                 ndoms--;        /* one less element */
1128                                 goto restart;
1129                         }
1130                 }
1131         }
1132 
1133         /*
1134          * Now we know how many domains to create.
1135          * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
1136          */
1137         doms = alloc_sched_domains(ndoms);
1138         if (!doms)
1139                 goto done;
1140 
1141         /*
1142          * The rest of the code, including the scheduler, can deal with
1143          * dattr==NULL case. No need to abort if alloc fails.
1144          */
1145         dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
1146                               GFP_KERNEL);
1147 
1148         /*
 1149          * Cgroup v2 doesn't support domain attributes; just set all of them
1150          * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
1151          * subset of HK_TYPE_DOMAIN housekeeping CPUs.
1152          */
1153         if (cgrpv2) {
1154                 for (i = 0; i < ndoms; i++) {
1155                         cpumask_copy(doms[i], csa[i]->effective_cpus);
1156                         if (dattr)
1157                                 dattr[i] = SD_ATTR_INIT;
1158                 }
1159                 goto done;
1160         }
1161 
1162         for (nslot = 0, i = 0; i < csn; i++) {
1163                 struct cpuset *a = csa[i];
1164                 struct cpumask *dp;
1165                 int apn = a->pn;
1166 
1167                 if (apn < 0) {
1168                         /* Skip completed partitions */
1169                         continue;
1170                 }
1171 
1172                 dp = doms[nslot];
1173 
1174                 if (nslot == ndoms) {
1175                         static int warnings = 10;
1176                         if (warnings) {
1177                                 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
1178                                         nslot, ndoms, csn, i, apn);
1179                                 warnings--;
1180                         }
1181                         continue;
1182                 }
1183 
1184                 cpumask_clear(dp);
1185                 if (dattr)
1186                         *(dattr + nslot) = SD_ATTR_INIT;
1187                 for (j = i; j < csn; j++) {
1188                         struct cpuset *b = csa[j];
1189 
1190                         if (apn == b->pn) {
1191                                 cpumask_or(dp, dp, b->effective_cpus);
1192                                 cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
1193                                 if (dattr)
1194                                         update_domain_attr_tree(dattr + nslot, b);
1195 
1196                                 /* Done with this partition */
1197                                 b->pn = -1;
1198                         }
1199                 }
1200                 nslot++;
1201         }
1202         BUG_ON(nslot != ndoms);
1203 
1204 done:
1205         kfree(csa);
1206 
1207         /*
1208          * Fallback to the default domain if kmalloc() failed.
1209          * See comments in partition_sched_domains().
1210          */
1211         if (doms == NULL)
1212                 ndoms = 1;
1213 
1214         *domains    = doms;
1215         *attributes = dattr;
1216         return ndoms;
1217 }
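
/*
 * Editor's illustrative sketch (not part of cpuset.c): the 'pn' merge step
 * described in the comment before generate_sched_domains(), reduced to plain
 * 64-bit masks so it can be compiled and run standalone. Cpusets whose CPU
 * masks transitively overlap end up with the same partition number, and
 * ndoms counts the resulting sched domains.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* CPU masks of three load-balanced cpusets: {0-1}, {1-2}, {4-5} */
        uint64_t cpus[] = { 0x03, 0x06, 0x30 };
        int pn[] = { 0, 1, 2 };
        int csn = 3, ndoms = csn, i, j, k;

restart:
        for (i = 0; i < csn; i++)
                for (j = 0; j < csn; j++)
                        if (pn[i] != pn[j] && (cpus[i] & cpus[j])) {
                                for (k = 0; k < csn; k++)
                                        if (pn[k] == pn[j])
                                                pn[k] = pn[i];  /* fold j's group into i's */
                                ndoms--;                        /* one less sched domain */
                                goto restart;
                        }

        /* Expect ndoms == 2: {0-2} (cpusets 0 and 1 merged) and {4-5} */
        printf("ndoms=%d pn=[%d %d %d]\n", ndoms, pn[0], pn[1], pn[2]);
        return 0;
}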
1218 
1219 static void dl_update_tasks_root_domain(struct cpuset *cs)
1220 {
1221         struct css_task_iter it;
1222         struct task_struct *task;
1223 
1224         if (cs->nr_deadline_tasks == 0)
1225                 return;
1226 
1227         css_task_iter_start(&cs->css, 0, &it);
1228 
1229         while ((task = css_task_iter_next(&it)))
1230                 dl_add_task_root_domain(task);
1231 
1232         css_task_iter_end(&it);
1233 }
1234 
1235 static void dl_rebuild_rd_accounting(void)
1236 {
1237         struct cpuset *cs = NULL;
1238         struct cgroup_subsys_state *pos_css;
1239 
1240         lockdep_assert_held(&cpuset_mutex);
1241         lockdep_assert_cpus_held();
1242         lockdep_assert_held(&sched_domains_mutex);
1243 
1244         rcu_read_lock();
1245 
1246         /*
 1247          * Clear default root domain DL accounting; it will be computed again
1248          * if a task belongs to it.
1249          */
1250         dl_clear_root_domain(&def_root_domain);
1251 
1252         cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
1253 
1254                 if (cpumask_empty(cs->effective_cpus)) {
1255                         pos_css = css_rightmost_descendant(pos_css);
1256                         continue;
1257                 }
1258 
1259                 css_get(&cs->css);
1260 
1261                 rcu_read_unlock();
1262 
1263                 dl_update_tasks_root_domain(cs);
1264 
1265                 rcu_read_lock();
1266                 css_put(&cs->css);
1267         }
1268         rcu_read_unlock();
1269 }
1270 
1271 static void
1272 partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
1273                                     struct sched_domain_attr *dattr_new)
1274 {
1275         mutex_lock(&sched_domains_mutex);
1276         partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
1277         dl_rebuild_rd_accounting();
1278         mutex_unlock(&sched_domains_mutex);
1279 }
1280 
1281 /*
1282  * Rebuild scheduler domains.
1283  *
1284  * If the flag 'sched_load_balance' of any cpuset with non-empty
1285  * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
1286  * which has that flag enabled, or if any cpuset with a non-empty
1287  * 'cpus' is removed, then call this routine to rebuild the
1288  * scheduler's dynamic sched domains.
1289  *
1290  * Call with cpuset_mutex held.  Takes cpus_read_lock().
1291  */
1292 static void rebuild_sched_domains_locked(void)
1293 {
1294         struct cgroup_subsys_state *pos_css;
1295         struct sched_domain_attr *attr;
1296         cpumask_var_t *doms;
1297         struct cpuset *cs;
1298         int ndoms;
1299 
1300         lockdep_assert_cpus_held();
1301         lockdep_assert_held(&cpuset_mutex);
1302 
1303         /*
1304          * If we have raced with CPU hotplug, return early to avoid
 1305          * passing doms with an offlined CPU to partition_sched_domains().
 1306          * In any case, cpuset_handle_hotplug() will rebuild the sched domains.
1307          *
1308          * With no CPUs in any subpartitions, top_cpuset's effective CPUs
1309          * should be the same as the active CPUs, so checking only top_cpuset
1310          * is enough to detect racing CPU offlines.
1311          */
1312         if (cpumask_empty(subpartitions_cpus) &&
1313             !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
1314                 return;
1315 
1316         /*
1317          * With subpartition CPUs, however, the effective CPUs of a partition
1318          * root should be only a subset of the active CPUs.  Since a CPU in any
1319          * partition root could be offlined, all must be checked.
1320          */
1321         if (!cpumask_empty(subpartitions_cpus)) {
1322                 rcu_read_lock();
1323                 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
1324                         if (!is_partition_valid(cs)) {
1325                                 pos_css = css_rightmost_descendant(pos_css);
1326                                 continue;
1327                         }
1328                         if (!cpumask_subset(cs->effective_cpus,
1329                                             cpu_active_mask)) {
1330                                 rcu_read_unlock();
1331                                 return;
1332                         }
1333                 }
1334                 rcu_read_unlock();
1335         }
1336 
1337         /* Generate domain masks and attrs */
1338         ndoms = generate_sched_domains(&doms, &attr);
1339 
1340         /* Have scheduler rebuild the domains */
1341         partition_and_rebuild_sched_domains(ndoms, doms, attr);
1342 }
1343 #else /* !CONFIG_SMP */
1344 static void rebuild_sched_domains_locked(void)
1345 {
1346 }
1347 #endif /* CONFIG_SMP */
1348 
1349 static void rebuild_sched_domains_cpuslocked(void)
1350 {
1351         mutex_lock(&cpuset_mutex);
1352         rebuild_sched_domains_locked();
1353         mutex_unlock(&cpuset_mutex);
1354 }
1355 
1356 void rebuild_sched_domains(void)
1357 {
1358         cpus_read_lock();
1359         rebuild_sched_domains_cpuslocked();
1360         cpus_read_unlock();
1361 }
1362 
1363 /**
1364  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
1365  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
1366  * @new_cpus: the temp variable for the new effective_cpus mask
1367  *
1368  * Iterate through each task of @cs updating its cpus_allowed to the
1369  * effective cpuset's.  As this function is called with cpuset_mutex held,
1370  * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
1371  * is used instead of effective_cpus to make sure all offline CPUs are also
1372  * included as hotplug code won't update cpumasks for tasks in top_cpuset.
1373  */
1374 static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
1375 {
1376         struct css_task_iter it;
1377         struct task_struct *task;
1378         bool top_cs = cs == &top_cpuset;
1379 
1380         css_task_iter_start(&cs->css, 0, &it);
1381         while ((task = css_task_iter_next(&it))) {
1382                 const struct cpumask *possible_mask = task_cpu_possible_mask(task);
1383 
1384                 if (top_cs) {
1385                         /*
1386                          * Percpu kthreads in top_cpuset are ignored
1387                          */
1388                         if (kthread_is_per_cpu(task))
1389                                 continue;
1390                         cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
1391                 } else {
1392                         cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
1393                 }
1394                 set_cpus_allowed_ptr(task, new_cpus);
1395         }
1396         css_task_iter_end(&it);
1397 }
1398 
1399 /**
1400  * compute_effective_cpumask - Compute the effective cpumask of the cpuset
1401  * @new_cpus: the temp variable for the new effective_cpus mask
 1402  * @cs: the cpuset that needs to recompute the new effective_cpus mask
1403  * @parent: the parent cpuset
1404  *
1405  * The result is valid only if the given cpuset isn't a partition root.
1406  */
1407 static void compute_effective_cpumask(struct cpumask *new_cpus,
1408                                       struct cpuset *cs, struct cpuset *parent)
1409 {
1410         cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
1411 }
1412 
1413 /*
1414  * Commands for update_parent_effective_cpumask
1415  */
1416 enum partition_cmd {
1417         partcmd_enable,         /* Enable partition root          */
1418         partcmd_enablei,        /* Enable isolated partition root */
1419         partcmd_disable,        /* Disable partition root         */
1420         partcmd_update,         /* Update parent's effective_cpus */
1421         partcmd_invalidate,     /* Make partition invalid         */
1422 };
1423 
1424 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1425                        int turning_on);
1426 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1427                                     struct tmpmasks *tmp);
1428 
1429 /*
1430  * Update partition exclusive flag
1431  *
1432  * Return: 0 if successful, an error code otherwise
1433  */
1434 static int update_partition_exclusive(struct cpuset *cs, int new_prs)
1435 {
1436         bool exclusive = (new_prs > PRS_MEMBER);
1437 
1438         if (exclusive && !is_cpu_exclusive(cs)) {
1439                 if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
1440                         return PERR_NOTEXCL;
1441         } else if (!exclusive && is_cpu_exclusive(cs)) {
1442                 /* Turning off CS_CPU_EXCLUSIVE will not return error */
1443                 update_flag(CS_CPU_EXCLUSIVE, cs, 0);
1444         }
1445         return 0;
1446 }
1447 
1448 /*
1449  * Update partition load balance flag and/or rebuild sched domain
1450  *
1451  * Changing load balance flag will automatically call
1452  * rebuild_sched_domains_locked().
1453  * This function is for cgroup v2 only.
1454  */
1455 static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
1456 {
1457         int new_prs = cs->partition_root_state;
1458         bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
1459         bool new_lb;
1460 
1461         /*
1462          * If cs is not a valid partition root, the load balance state
1463          * will follow its parent.
1464          */
1465         if (new_prs > 0) {
1466                 new_lb = (new_prs != PRS_ISOLATED);
1467         } else {
1468                 new_lb = is_sched_load_balance(parent_cs(cs));
1469         }
1470         if (new_lb != !!is_sched_load_balance(cs)) {
1471                 rebuild_domains = true;
1472                 if (new_lb)
1473                         set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1474                 else
1475                         clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1476         }
1477 
1478         if (rebuild_domains)
1479                 rebuild_sched_domains_locked();
1480 }
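
/*
 * For example, switching a valid partition from "root" to "isolated"
 * clears CS_SCHED_LOAD_BALANCE here (new_prs == PRS_ISOLATED) and
 * triggers a rebuild of the sched domains; switching back sets the flag
 * again.
 */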
1481 
1482 /*
1483  * tasks_nocpu_error - Return true if tasks will have no effective_cpus
1484  */
1485 static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
1486                               struct cpumask *xcpus)
1487 {
1488         /*
1489          * A populated partition (cs or parent) can't have empty effective_cpus
1490          */
1491         return (cpumask_subset(parent->effective_cpus, xcpus) &&
1492                 partition_is_populated(parent, cs)) ||
1493                (!cpumask_intersects(xcpus, cpu_active_mask) &&
1494                 partition_is_populated(cs, NULL));
1495 }
1496 
1497 static void reset_partition_data(struct cpuset *cs)
1498 {
1499         struct cpuset *parent = parent_cs(cs);
1500 
1501         if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
1502                 return;
1503 
1504         lockdep_assert_held(&callback_lock);
1505 
1506         cs->nr_subparts = 0;
1507         if (cpumask_empty(cs->exclusive_cpus)) {
1508                 cpumask_clear(cs->effective_xcpus);
1509                 if (is_cpu_exclusive(cs))
1510                         clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
1511         }
1512         if (!cpumask_and(cs->effective_cpus,
1513                          parent->effective_cpus, cs->cpus_allowed)) {
1514                 cs->use_parent_ecpus = true;
1515                 parent->child_ecpus_count++;
1516                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1517         }
1518 }
1519 
1520 /*
1521  * partition_xcpus_newstate - Exclusive CPUs state change
1522  * @old_prs: old partition_root_state
1523  * @new_prs: new partition_root_state
1524  * @xcpus: exclusive CPUs with state change
1525  */
1526 static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
1527 {
1528         WARN_ON_ONCE(old_prs == new_prs);
1529         if (new_prs == PRS_ISOLATED)
1530                 cpumask_or(isolated_cpus, isolated_cpus, xcpus);
1531         else
1532                 cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
1533 }
1534 
1535 /*
1536  * partition_xcpus_add - Add new exclusive CPUs to partition
1537  * @new_prs: new partition_root_state
1538  * @parent: parent cpuset
1539  * @xcpus: exclusive CPUs to be added
1540  * Return: true if isolated_cpus modified, false otherwise
1541  *
1542  * Remote partition if parent == NULL
1543  */
1544 static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
1545                                 struct cpumask *xcpus)
1546 {
1547         bool isolcpus_updated;
1548 
1549         WARN_ON_ONCE(new_prs < 0);
1550         lockdep_assert_held(&callback_lock);
1551         if (!parent)
1552                 parent = &top_cpuset;
1553 
1554 
1555         if (parent == &top_cpuset)
1556                 cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
1557 
1558         isolcpus_updated = (new_prs != parent->partition_root_state);
1559         if (isolcpus_updated)
1560                 partition_xcpus_newstate(parent->partition_root_state, new_prs,
1561                                          xcpus);
1562 
1563         cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
1564         return isolcpus_updated;
1565 }
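
/*
 * For instance, enabling an isolated partition with exclusive CPUs 2-3
 * directly under the top cpuset adds 2-3 to subpartitions_cpus and (via
 * partition_xcpus_newstate) to isolated_cpus, and removes them from the
 * top cpuset's effective_cpus.
 */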
1566 
1567 /*
1568  * partition_xcpus_del - Remove exclusive CPUs from partition
1569  * @old_prs: old partition_root_state
1570  * @parent: parent cpuset
1571  * @xcpus: exclusive CPUs to be removed
1572  * Return: true if isolated_cpus modified, false otherwise
1573  *
1574  * Remote partition if parent == NULL
1575  */
1576 static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
1577                                 struct cpumask *xcpus)
1578 {
1579         bool isolcpus_updated;
1580 
1581         WARN_ON_ONCE(old_prs < 0);
1582         lockdep_assert_held(&callback_lock);
1583         if (!parent)
1584                 parent = &top_cpuset;
1585 
1586         if (parent == &top_cpuset)
1587                 cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
1588 
1589         isolcpus_updated = (old_prs != parent->partition_root_state);
1590         if (isolcpus_updated)
1591                 partition_xcpus_newstate(old_prs, parent->partition_root_state,
1592                                          xcpus);
1593 
1594         cpumask_and(xcpus, xcpus, cpu_active_mask);
1595         cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
1596         return isolcpus_updated;
1597 }
1598 
1599 static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
1600 {
1601         int ret;
1602 
1603         lockdep_assert_cpus_held();
1604 
1605         if (!isolcpus_updated)
1606                 return;
1607 
1608         ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
1609         WARN_ON_ONCE(ret < 0);
1610 }
1611 
1612 /**
1613  * cpuset_cpu_is_isolated - Check if the given CPU is isolated
1614  * @cpu: the CPU number to be checked
1615  * Return: true if CPU is used in an isolated partition, false otherwise
1616  */
1617 bool cpuset_cpu_is_isolated(int cpu)
1618 {
1619         return cpumask_test_cpu(cpu, isolated_cpus);
1620 }
1621 EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
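
/*
 * A typical caller is cpu_is_isolated() in <linux/sched/isolation.h>,
 * which combines this test with the housekeeping cpumasks so that, for
 * example, deferrable per-CPU work can avoid CPUs placed in an isolated
 * partition.
 */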
1622 
1623 /*
1624  * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
1625  * @cs: cpuset
1626  * @xcpus: effective exclusive CPUs value to be set
1627  * Return: true if xcpus is not empty, false otherwise.
1628  *
1629  * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
1630  * it must be a subset of parent's effective_xcpus.
1631  */
1632 static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
1633                                                 struct cpumask *xcpus)
1634 {
1635         struct cpuset *parent = parent_cs(cs);
1636 
1637         if (!xcpus)
1638                 xcpus = cs->effective_xcpus;
1639 
1640         return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
1641 }
1642 
1643 static inline bool is_remote_partition(struct cpuset *cs)
1644 {
1645         return !list_empty(&cs->remote_sibling);
1646 }
1647 
1648 static inline bool is_local_partition(struct cpuset *cs)
1649 {
1650         return is_partition_valid(cs) && !is_remote_partition(cs);
1651 }
1652 
1653 /*
1654  * remote_partition_enable - Enable current cpuset as a remote partition root
1655  * @cs: the cpuset to update
1656  * @new_prs: new partition_root_state
1657  * @tmp: temporary masks
1658  * Return: 1 if successful, 0 if error
1659  *
1660  * Enable the current cpuset to become a remote partition root taking CPUs
1661  * directly from the top cpuset. cpuset_mutex must be held by the caller.
1662  */
1663 static int remote_partition_enable(struct cpuset *cs, int new_prs,
1664                                    struct tmpmasks *tmp)
1665 {
1666         bool isolcpus_updated;
1667 
1668         /*
1669          * The user must have sysadmin privilege.
1670          */
1671         if (!capable(CAP_SYS_ADMIN))
1672                 return 0;
1673 
1674         /*
1675          * The requested exclusive_cpus must not be allocated to other
1676          * partitions and it can't use up all the root's effective_cpus.
1677          *
1678          * Note that if there is any local partition root above it or
1679          * remote partition root underneath it, its exclusive_cpus must
1680          * have overlapped with subpartitions_cpus.
1681          */
1682         compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
1683         if (cpumask_empty(tmp->new_cpus) ||
1684             cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
1685             cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
1686                 return 0;
1687 
1688         spin_lock_irq(&callback_lock);
1689         isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
1690         list_add(&cs->remote_sibling, &remote_children);
1691         if (cs->use_parent_ecpus) {
1692                 struct cpuset *parent = parent_cs(cs);
1693 
1694                 cs->use_parent_ecpus = false;
1695                 parent->child_ecpus_count--;
1696         }
1697         spin_unlock_irq(&callback_lock);
1698         update_unbound_workqueue_cpumask(isolcpus_updated);
1699 
1700         /*
1701          * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1702          */
1703         update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1704         update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1705         return 1;
1706 }
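
/*
 * A remote partition is one whose parent is not itself a valid partition
 * root; in practice it is typically set up by populating
 * "cpuset.cpus.exclusive" down the hierarchy and then writing "root" or
 * "isolated" to "cpuset.cpus.partition" on a cpuset further down the tree.
 */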
1707 
1708 /*
1709  * remote_partition_disable - Remove current cpuset from remote partition list
1710  * @cs: the cpuset to update
1711  * @tmp: temporary masks
1712  *
1713  * The effective_cpus is also updated.
1714  *
1715  * cpuset_mutex must be held by the caller.
1716  */
1717 static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
1718 {
1719         bool isolcpus_updated;
1720 
1721         compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
1722         WARN_ON_ONCE(!is_remote_partition(cs));
1723         WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
1724 
1725         spin_lock_irq(&callback_lock);
1726         list_del_init(&cs->remote_sibling);
1727         isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
1728                                                NULL, tmp->new_cpus);
1729         cs->partition_root_state = -cs->partition_root_state;
1730         if (!cs->prs_err)
1731                 cs->prs_err = PERR_INVCPUS;
1732         reset_partition_data(cs);
1733         spin_unlock_irq(&callback_lock);
1734         update_unbound_workqueue_cpumask(isolcpus_updated);
1735 
1736         /*
1737          * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1738          */
1739         update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1740         update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1741 }
1742 
1743 /*
1744  * remote_cpus_update - cpus_exclusive change of remote partition
1745  * @cs: the cpuset to be updated
1746  * @newmask: the new effective_xcpus mask
1747  * @tmp: temporary masks
1748  *
1749  * top_cpuset and subpartitions_cpus will be updated, or the partition may
1750  * be invalidated.
1751  */
1752 static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
1753                                struct tmpmasks *tmp)
1754 {
1755         bool adding, deleting;
1756         int prs = cs->partition_root_state;
1757         int isolcpus_updated = 0;
1758 
1759         if (WARN_ON_ONCE(!is_remote_partition(cs)))
1760                 return;
1761 
1762         WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
1763 
1764         if (cpumask_empty(newmask))
1765                 goto invalidate;
1766 
1767         adding   = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
1768         deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);
1769 
1770         /*
1771          * Adding remote CPUs is only allowed if those CPUs are
1772          * not allocated to other partitions and there are effective_cpus
1773          * left in the top cpuset.
1774          */
1775         if (adding && (!capable(CAP_SYS_ADMIN) ||
1776                        cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
1777                        cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)))
1778                 goto invalidate;
1779 
1780         spin_lock_irq(&callback_lock);
1781         if (adding)
1782                 isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
1783         if (deleting)
1784                 isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
1785         spin_unlock_irq(&callback_lock);
1786         update_unbound_workqueue_cpumask(isolcpus_updated);
1787 
1788         /*
1789          * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
1790          */
1791         update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
1792         update_sibling_cpumasks(&top_cpuset, NULL, tmp);
1793         return;
1794 
1795 invalidate:
1796         remote_partition_disable(cs, tmp);
1797 }
1798 
1799 /*
1800  * remote_partition_check - check if a child remote partition needs update
1801  * @cs: the cpuset to be updated
1802  * @newmask: the new effective_xcpus mask
1803  * @delmask: temporary mask for deletion (not in tmp)
1804  * @tmp: temporary masks
1805  *
1806  * This should be called before the given cs has updated its cpus_allowed
1807  * and/or effective_xcpus.
1808  */
1809 static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
1810                                    struct cpumask *delmask, struct tmpmasks *tmp)
1811 {
1812         struct cpuset *child, *next;
1813         int disable_cnt = 0;
1814 
1815         /*
1816          * Compute the effective exclusive CPUs that will be deleted.
1817          */
1818         if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
1819             !cpumask_intersects(delmask, subpartitions_cpus))
1820                 return; /* No deletion of exclusive CPUs in partitions */
1821 
1822         /*
1823          * Searching the remote children list to look for those that will
1824          * be impacted by the deletion of exclusive CPUs.
1825          *
1826          * Since a cpuset must be removed from the remote children list
1827          * before it can go offline, and holding cpuset_mutex prevents
1828          * any change in cpuset status, the RCU read lock isn't needed.
1829          */
1830         lockdep_assert_held(&cpuset_mutex);
1831         list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
1832                 if (cpumask_intersects(child->effective_cpus, delmask)) {
1833                         remote_partition_disable(child, tmp);
1834                         disable_cnt++;
1835                 }
1836         if (disable_cnt)
1837                 rebuild_sched_domains_locked();
1838 }
1839 
1840 /*
1841  * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
1842  * @prstate: partition root state to be checked
1843  * @new_cpus: cpu mask
1844  * Return: true if there is conflict, false otherwise
1845  *
1846  * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in
1847  * an isolated partition.
1848  */
1849 static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
1850 {
1851         const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN);
1852         bool all_in_hk = cpumask_subset(new_cpus, hk_domain);
1853 
1854         if (!all_in_hk && (prstate != PRS_ISOLATED))
1855                 return true;
1856 
1857         return false;
1858 }
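
/*
 * For example, booting with "isolcpus=domain,8-15" removes CPUs 8-15 from
 * housekeeping_cpumask(HK_TYPE_DOMAIN); those CPUs can then only be placed
 * in an isolated partition, and a request to make them part of a "root"
 * partition would be reported as a conflict here.
 */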
1859 
1860 /**
1861  * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
1862  * @cs:      The cpuset that requests change in partition root state
1863  * @cmd:     Partition root state change command
1864  * @newmask: Optional new cpumask for partcmd_update
1865  * @tmp:     Temporary addmask and delmask
1866  * Return:   0 or a partition root state error code
1867  *
1868  * For partcmd_enable*, the cpuset is being transformed from a non-partition
1869  * root to a partition root. The effective_xcpus (cpus_allowed if
1870  * effective_xcpus not set) mask of the given cpuset will be taken away from
1871  * parent's effective_cpus. The function will return 0 if all the CPUs listed
1872  * in effective_xcpus can be granted or an error code will be returned.
1873  *
1874  * For partcmd_disable, the cpuset is being transformed from a partition
1875  * root back to a non-partition root. Any CPUs in effective_xcpus will be
1876  * given back to parent's effective_cpus. 0 will always be returned.
1877  *
1878  * For partcmd_update, if the optional newmask is specified, the cpu list is
1879  * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
1880  * assumed to remain the same. The cpuset should either be a valid or invalid
1881  * partition root. The partition root state may change from valid to invalid
1882  * or vice versa. An error code will be returned if transitioning from
1883  * invalid to valid violates the exclusivity rule.
1884  *
1885  * For partcmd_invalidate, the current partition will be made invalid.
1886  *
1887  * The partcmd_enable* and partcmd_disable commands are used by
1888  * update_prstate(). An error code may be returned and the caller will check
1889  * for error.
1890  *
1891  * The partcmd_update command is used by update_cpumasks_hier() with newmask
1892  * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
1893  * by update_cpumask() with NULL newmask. In both cases, the callers won't
1894  * check for error and so partition_root_state and prs_error will be updated
1895  * directly.
1896  */
1897 static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
1898                                            struct cpumask *newmask,
1899                                            struct tmpmasks *tmp)
1900 {
1901         struct cpuset *parent = parent_cs(cs);
1902         int adding;     /* Adding cpus to parent's effective_cpus       */
1903         int deleting;   /* Deleting cpus from parent's effective_cpus   */
1904         int old_prs, new_prs;
1905         int part_error = PERR_NONE;     /* Partition error? */
1906         int subparts_delta = 0;
1907         struct cpumask *xcpus;          /* cs effective_xcpus */
1908         int isolcpus_updated = 0;
1909         bool nocpu;
1910 
1911         lockdep_assert_held(&cpuset_mutex);
1912 
1913         /*
1914          * new_prs will only be changed for the partcmd_update and
1915          * partcmd_invalidate commands.
1916          */
1917         adding = deleting = false;
1918         old_prs = new_prs = cs->partition_root_state;
1919         xcpus = user_xcpus(cs);
1920 
1921         if (cmd == partcmd_invalidate) {
1922                 if (is_prs_invalid(old_prs))
1923                         return 0;
1924 
1925                 /*
1926                  * Make the current partition invalid.
1927                  */
1928                 if (is_partition_valid(parent))
1929                         adding = cpumask_and(tmp->addmask,
1930                                              xcpus, parent->effective_xcpus);
1931                 if (old_prs > 0) {
1932                         new_prs = -old_prs;
1933                         subparts_delta--;
1934                 }
1935                 goto write_error;
1936         }
1937 
1938         /*
1939          * The parent must be a partition root.
1940          * The new cpumask, if present, or the current cpus_allowed must
1941          * not be empty.
1942          */
1943         if (!is_partition_valid(parent)) {
1944                 return is_partition_invalid(parent)
1945                        ? PERR_INVPARENT : PERR_NOTPART;
1946         }
1947         if (!newmask && xcpus_empty(cs))
1948                 return PERR_CPUSEMPTY;
1949 
1950         nocpu = tasks_nocpu_error(parent, cs, xcpus);
1951 
1952         if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
1953                 /*
1954                  * Enabling partition root is not allowed if its
1955                  * effective_xcpus is empty or doesn't overlap with
1956                  * parent's effective_xcpus.
1957                  */
1958                 if (cpumask_empty(xcpus) ||
1959                     !cpumask_intersects(xcpus, parent->effective_xcpus))
1960                         return PERR_INVCPUS;
1961 
1962                 if (prstate_housekeeping_conflict(new_prs, xcpus))
1963                         return PERR_HKEEPING;
1964 
1965                 /*
1966                  * A parent can be left with no CPU as long as there is no
1967                  * task directly associated with the parent partition.
1968                  */
1969                 if (nocpu)
1970                         return PERR_NOCPUS;
1971 
1972                 cpumask_copy(tmp->delmask, xcpus);
1973                 deleting = true;
1974                 subparts_delta++;
1975                 new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
1976         } else if (cmd == partcmd_disable) {
1977                 /*
1978                  * May need to add cpus to parent's effective_cpus for
1979                  * valid partition root.
1980                  */
1981                 adding = !is_prs_invalid(old_prs) &&
1982                           cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
1983                 if (adding)
1984                         subparts_delta--;
1985                 new_prs = PRS_MEMBER;
1986         } else if (newmask) {
1987                 /*
1988                  * Empty cpumask is not allowed
1989                  */
1990                 if (cpumask_empty(newmask)) {
1991                         part_error = PERR_CPUSEMPTY;
1992                         goto write_error;
1993                 }
1994 
1995                 /*
1996                  * partcmd_update with newmask:
1997                  *
1998                  * Compute add/delete mask to/from effective_cpus
1999                  *
2000                  * For valid partition:
2001                  *   addmask = exclusive_cpus & ~newmask
2002                  *                            & parent->effective_xcpus
2003                  *   delmask = newmask & ~exclusive_cpus
2004                  *                     & parent->effective_xcpus
2005                  *
2006                  * For invalid partition:
2007                  *   delmask = newmask & parent->effective_xcpus
2008                  */
2009                 if (is_prs_invalid(old_prs)) {
2010                         adding = false;
2011                         deleting = cpumask_and(tmp->delmask,
2012                                         newmask, parent->effective_xcpus);
2013                 } else {
2014                         cpumask_andnot(tmp->addmask, xcpus, newmask);
2015                         adding = cpumask_and(tmp->addmask, tmp->addmask,
2016                                              parent->effective_xcpus);
2017 
2018                         cpumask_andnot(tmp->delmask, newmask, xcpus);
2019                         deleting = cpumask_and(tmp->delmask, tmp->delmask,
2020                                                parent->effective_xcpus);
2021                 }
2022                 /*
2023                  * Make partition invalid if parent's effective_cpus could
2024                  * become empty and there are tasks in the parent.
2025                  */
2026                 if (nocpu && (!adding ||
2027                     !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
2028                         part_error = PERR_NOCPUS;
2029                         deleting = false;
2030                         adding = cpumask_and(tmp->addmask,
2031                                              xcpus, parent->effective_xcpus);
2032                 }
2033         } else {
2034                 /*
2035                  * partcmd_update w/o newmask
2036                  *
2037                  * delmask = effective_xcpus & parent->effective_cpus
2038                  *
2039                  * This can be called from:
2040                  * 1) update_cpumasks_hier()
2041                  * 2) cpuset_hotplug_update_tasks()
2042                  *
2043                  * Check to see if it can be transitioned from valid to
2044                  * invalid partition or vice versa.
2045                  *
2046                  * A partition error happens when parent has tasks and all
2047                  * its effective CPUs will have to be distributed out.
2048                  */
2049                 WARN_ON_ONCE(!is_partition_valid(parent));
2050                 if (nocpu) {
2051                         part_error = PERR_NOCPUS;
2052                         if (is_partition_valid(cs))
2053                                 adding = cpumask_and(tmp->addmask,
2054                                                 xcpus, parent->effective_xcpus);
2055                 } else if (is_partition_invalid(cs) &&
2056                            cpumask_subset(xcpus, parent->effective_xcpus)) {
2057                         struct cgroup_subsys_state *css;
2058                         struct cpuset *child;
2059                         bool exclusive = true;
2060 
2061                         /*
2062                          * Converting an invalid partition to a valid one has to
2063                          * pass the cpu exclusivity test.
2064                          */
2065                         rcu_read_lock();
2066                         cpuset_for_each_child(child, css, parent) {
2067                                 if (child == cs)
2068                                         continue;
2069                                 if (!cpusets_are_exclusive(cs, child)) {
2070                                         exclusive = false;
2071                                         break;
2072                                 }
2073                         }
2074                         rcu_read_unlock();
2075                         if (exclusive)
2076                                 deleting = cpumask_and(tmp->delmask,
2077                                                 xcpus, parent->effective_cpus);
2078                         else
2079                                 part_error = PERR_NOTEXCL;
2080                 }
2081         }
2082 
2083 write_error:
2084         if (part_error)
2085                 WRITE_ONCE(cs->prs_err, part_error);
2086 
2087         if (cmd == partcmd_update) {
2088                 /*
2089                  * Check for possible transition between valid and invalid
2090                  * partition root.
2091                  */
2092                 switch (cs->partition_root_state) {
2093                 case PRS_ROOT:
2094                 case PRS_ISOLATED:
2095                         if (part_error) {
2096                                 new_prs = -old_prs;
2097                                 subparts_delta--;
2098                         }
2099                         break;
2100                 case PRS_INVALID_ROOT:
2101                 case PRS_INVALID_ISOLATED:
2102                         if (!part_error) {
2103                                 new_prs = -old_prs;
2104                                 subparts_delta++;
2105                         }
2106                         break;
2107                 }
2108         }
2109 
2110         if (!adding && !deleting && (new_prs == old_prs))
2111                 return 0;
2112 
2113         /*
2114          * Transitioning between invalid to valid or vice versa may require
2115          * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
2116          * validate_change() has already been successfully called and
2117          * CPU lists in cs haven't been updated yet. So defer it to later.
2118          */
2119         if ((old_prs != new_prs) && (cmd != partcmd_update))  {
2120                 int err = update_partition_exclusive(cs, new_prs);
2121 
2122                 if (err)
2123                         return err;
2124         }
2125 
2126         /*
2127          * Change the parent's effective_cpus & effective_xcpus (top cpuset
2128          * only).
2129          *
2130          * Newly added CPUs will be removed from effective_cpus and
2131          * newly deleted ones will be added back to effective_cpus.
2132          */
2133         spin_lock_irq(&callback_lock);
2134         if (old_prs != new_prs) {
2135                 cs->partition_root_state = new_prs;
2136                 if (new_prs <= 0)
2137                         cs->nr_subparts = 0;
2138         }
2139         /*
2140          * Adding to parent's effective_cpus means deleting CPUs from cs
2141          * and vice versa.
2142          */
2143         if (adding)
2144                 isolcpus_updated += partition_xcpus_del(old_prs, parent,
2145                                                         tmp->addmask);
2146         if (deleting)
2147                 isolcpus_updated += partition_xcpus_add(new_prs, parent,
2148                                                         tmp->delmask);
2149 
2150         if (is_partition_valid(parent)) {
2151                 parent->nr_subparts += subparts_delta;
2152                 WARN_ON_ONCE(parent->nr_subparts < 0);
2153         }
2154         spin_unlock_irq(&callback_lock);
2155         update_unbound_workqueue_cpumask(isolcpus_updated);
2156 
2157         if ((old_prs != new_prs) && (cmd == partcmd_update))
2158                 update_partition_exclusive(cs, new_prs);
2159 
2160         if (adding || deleting) {
2161                 update_tasks_cpumask(parent, tmp->addmask);
2162                 update_sibling_cpumasks(parent, cs, tmp);
2163         }
2164 
2165         /*
2166          * For partcmd_update without newmask, it is being called from
2167          * cpuset_handle_hotplug(). Update the load balance flag and
2168          * scheduling domain accordingly.
2169          */
2170         if ((cmd == partcmd_update) && !newmask)
2171                 update_partition_sd_lb(cs, old_prs);
2172 
2173         notify_partition_change(cs, old_prs);
2174         return 0;
2175 }
2176 
2177 /**
2178  * compute_partition_effective_cpumask - compute effective_cpus for partition
2179  * @cs: partition root cpuset
2180  * @new_ecpus: previously computed effective_cpus to be updated
2181  *
2182  * Compute the effective_cpus of a partition root by scanning effective_xcpus
2183  * of child partition roots and excluding their effective_xcpus.
2184  *
2185  * This has the side effect of invalidating valid child partition roots,
2186  * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
2187  * or update_cpumasks_hier() where parent and children are modified
2188  * successively, we don't need to call update_parent_effective_cpumask()
2189  * and the child's effective_cpus will be updated in later iterations.
2190  *
2191  * Note that rcu_read_lock() is assumed to be held.
2192  */
2193 static void compute_partition_effective_cpumask(struct cpuset *cs,
2194                                                 struct cpumask *new_ecpus)
2195 {
2196         struct cgroup_subsys_state *css;
2197         struct cpuset *child;
2198         bool populated = partition_is_populated(cs, NULL);
2199 
2200         /*
2201          * Check child partition roots to see if they should be
2202          * invalidated when
2203          *  1) child effective_xcpus not a subset of new
2204          *     exclusive_cpus
2205          *  2) all the effective_cpus will be used up and cs
2206          *     has tasks
2207          */
2208         compute_effective_exclusive_cpumask(cs, new_ecpus);
2209         cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
2210 
2211         rcu_read_lock();
2212         cpuset_for_each_child(child, css, cs) {
2213                 if (!is_partition_valid(child))
2214                         continue;
2215 
2216                 child->prs_err = 0;
2217                 if (!cpumask_subset(child->effective_xcpus,
2218                                     cs->effective_xcpus))
2219                         child->prs_err = PERR_INVCPUS;
2220                 else if (populated &&
2221                          cpumask_subset(new_ecpus, child->effective_xcpus))
2222                         child->prs_err = PERR_NOCPUS;
2223 
2224                 if (child->prs_err) {
2225                         int old_prs = child->partition_root_state;
2226 
2227                         /*
2228                          * Invalidate child partition
2229                          */
2230                         spin_lock_irq(&callback_lock);
2231                         make_partition_invalid(child);
2232                         cs->nr_subparts--;
2233                         child->nr_subparts = 0;
2234                         spin_unlock_irq(&callback_lock);
2235                         notify_partition_change(child, old_prs);
2236                         continue;
2237                 }
2238                 cpumask_andnot(new_ecpus, new_ecpus,
2239                                child->effective_xcpus);
2240         }
2241         rcu_read_unlock();
2242 }
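
/*
 * As a worked example: a partition root with effective_xcpus 0-7 and one
 * valid child partition holding effective_xcpus 4-5 ends up with
 * new_ecpus = 0-3,6-7 (further limited to cpu_active_mask), assuming the
 * child passes the subset and non-empty checks above.
 */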
2243 
2244 /*
2245  * update_cpumasks_hier() flags
2246  */
2247 #define HIER_CHECKALL           0x01    /* Check all cpusets with no skipping */
2248 #define HIER_NO_SD_REBUILD      0x02    /* Don't rebuild sched domains */
2249 
2250 /*
2251  * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
2252  * @cs:  the cpuset to consider
2253  * @tmp: temp variables for calculating effective_cpus & partition setup
2254  * @flags: update flags (HIER_CHECKALL: check all cpusets; HIER_NO_SD_REBUILD: don't rebuild sched domains)
2255  *
2256  * When configured cpumask is changed, the effective cpumasks of this cpuset
2257  * and all its descendants need to be updated.
2258  *
2259  * On the legacy hierarchy, effective_cpus will be the same as cpus_allowed.
2260  *
2261  * Called with cpuset_mutex held
2262  */
2263 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
2264                                  int flags)
2265 {
2266         struct cpuset *cp;
2267         struct cgroup_subsys_state *pos_css;
2268         bool need_rebuild_sched_domains = false;
2269         int old_prs, new_prs;
2270 
2271         rcu_read_lock();
2272         cpuset_for_each_descendant_pre(cp, pos_css, cs) {
2273                 struct cpuset *parent = parent_cs(cp);
2274                 bool remote = is_remote_partition(cp);
2275                 bool update_parent = false;
2276 
2277                 /*
2278                  * Skip a descendant remote partition that acquires CPUs
2279                  * directly from the top cpuset, unless it is cs.
2280                  */
2281                 if (remote && (cp != cs)) {
2282                         pos_css = css_rightmost_descendant(pos_css);
2283                         continue;
2284                 }
2285 
2286                 /*
2287                  * Update effective_xcpus if exclusive_cpus set.
2288                  * The case when exclusive_cpus isn't set is handled later.
2289                  */
2290                 if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
2291                         spin_lock_irq(&callback_lock);
2292                         compute_effective_exclusive_cpumask(cp, NULL);
2293                         spin_unlock_irq(&callback_lock);
2294                 }
2295 
2296                 old_prs = new_prs = cp->partition_root_state;
2297                 if (remote || (is_partition_valid(parent) &&
2298                                is_partition_valid(cp)))
2299                         compute_partition_effective_cpumask(cp, tmp->new_cpus);
2300                 else
2301                         compute_effective_cpumask(tmp->new_cpus, cp, parent);
2302 
2303                 /*
2304                  * A partition with no effective_cpus is allowed as long as
2305                  * there is no task associated with it. Call
2306                  * update_parent_effective_cpumask() to check it.
2307                  */
2308                 if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
2309                         update_parent = true;
2310                         goto update_parent_effective;
2311                 }
2312 
2313                 /*
2314                  * If it becomes empty, inherit the effective mask of the
2315                  * parent, which is guaranteed to have some CPUs unless
2316                  * it is a partition root that has explicitly distributed
2317                  * out all its CPUs.
2318                  */
2319                 if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) {
2320                         cpumask_copy(tmp->new_cpus, parent->effective_cpus);
2321                         if (!cp->use_parent_ecpus) {
2322                                 cp->use_parent_ecpus = true;
2323                                 parent->child_ecpus_count++;
2324                         }
2325                 } else if (cp->use_parent_ecpus) {
2326                         cp->use_parent_ecpus = false;
2327                         WARN_ON_ONCE(!parent->child_ecpus_count);
2328                         parent->child_ecpus_count--;
2329                 }
2330 
2331                 if (remote)
2332                         goto get_css;
2333 
2334                 /*
2335                  * Skip the whole subtree if
2336                  * 1) the cpumask remains the same,
2337                  * 2) has no partition root state,
2338                  * 3) HIER_CHECKALL flag not set, and
2339                  * 4) for v2 load balance state same as its parent.
2340                  */
2341                 if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
2342                     cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
2343                     (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
2344                     (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
2345                         pos_css = css_rightmost_descendant(pos_css);
2346                         continue;
2347                 }
2348 
2349 update_parent_effective:
2350                 /*
2351                  * update_parent_effective_cpumask() should have been called
2352                  * for cs already in update_cpumask(). We should also call
2353                  * update_tasks_cpumask() again for tasks in the parent
2354                  * cpuset if the parent's effective_cpus changes.
2355                  */
2356                 if ((cp != cs) && old_prs) {
2357                         switch (parent->partition_root_state) {
2358                         case PRS_ROOT:
2359                         case PRS_ISOLATED:
2360                                 update_parent = true;
2361                                 break;
2362 
2363                         default:
2364                                 /*
2365                                  * When parent is not a partition root or is
2366                                  * invalid, child partition roots become
2367                                  * invalid too.
2368                                  */
2369                                 if (is_partition_valid(cp))
2370                                         new_prs = -cp->partition_root_state;
2371                                 WRITE_ONCE(cp->prs_err,
2372                                            is_partition_invalid(parent)
2373                                            ? PERR_INVPARENT : PERR_NOTPART);
2374                                 break;
2375                         }
2376                 }
2377 get_css:
2378                 if (!css_tryget_online(&cp->css))
2379                         continue;
2380                 rcu_read_unlock();
2381 
2382                 if (update_parent) {
2383                         update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
2384                         /*
2385                          * The cpuset partition_root_state may become
2386                          * invalid. Capture it.
2387                          */
2388                         new_prs = cp->partition_root_state;
2389                 }
2390 
2391                 spin_lock_irq(&callback_lock);
2392                 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
2393                 cp->partition_root_state = new_prs;
2394                 /*
2395                  * Make sure effective_xcpus is properly set for a valid
2396                  * partition root.
2397                  */
2398                 if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
2399                         cpumask_and(cp->effective_xcpus,
2400                                     cp->cpus_allowed, parent->effective_xcpus);
2401                 else if (new_prs < 0)
2402                         reset_partition_data(cp);
2403                 spin_unlock_irq(&callback_lock);
2404 
2405                 notify_partition_change(cp, old_prs);
2406 
2407                 WARN_ON(!is_in_v2_mode() &&
2408                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
2409 
2410                 update_tasks_cpumask(cp, cp->effective_cpus);
2411 
2412                 /*
2413                  * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
2414                  * from parent if current cpuset isn't a valid partition root
2415                  * and their load balance states differ.
2416                  */
2417                 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
2418                     !is_partition_valid(cp) &&
2419                     (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
2420                         if (is_sched_load_balance(parent))
2421                                 set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
2422                         else
2423                                 clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
2424                 }
2425 
2426                 /*
2427                  * On legacy hierarchy, if the effective cpumask of any non-
2428                  * empty cpuset is changed, we need to rebuild sched domains.
2429                  * On default hierarchy, the cpuset needs to be a partition
2430                  * root as well.
2431                  */
2432                 if (!cpumask_empty(cp->cpus_allowed) &&
2433                     is_sched_load_balance(cp) &&
2434                    (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
2435                     is_partition_valid(cp)))
2436                         need_rebuild_sched_domains = true;
2437 
2438                 rcu_read_lock();
2439                 css_put(&cp->css);
2440         }
2441         rcu_read_unlock();
2442 
2443         if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD))
2444                 rebuild_sched_domains_locked();
2445 }
2446 
2447 /**
2448  * update_sibling_cpumasks - Update siblings cpumasks
2449  * @parent:  Parent cpuset
2450  * @cs:      Current cpuset
2451  * @tmp:     Temp variables
2452  */
2453 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
2454                                     struct tmpmasks *tmp)
2455 {
2456         struct cpuset *sibling;
2457         struct cgroup_subsys_state *pos_css;
2458 
2459         lockdep_assert_held(&cpuset_mutex);
2460 
2461         /*
2462          * Check all its siblings and call update_cpumasks_hier()
2463          * if their effective_cpus will need to be changed.
2464          *
2465          * With the addition of effective_xcpus, which is a subset of
2466          * cpus_allowed, it is possible that a change in the parent's
2467          * effective_cpus caused by a change in a child partition's
2468          * effective_xcpus will impact its siblings even if they do not
2469          * inherit the parent's effective_cpus directly.
2470          *
2471          * The update_cpumasks_hier() function may sleep. So we have to
2472          * release the RCU read lock before calling it. HIER_NO_SD_REBUILD
2473          * flag is used to suppress rebuild of sched domains as the callers
2474          * will take care of that.
2475          */
2476         rcu_read_lock();
2477         cpuset_for_each_child(sibling, pos_css, parent) {
2478                 if (sibling == cs)
2479                         continue;
2480                 if (!sibling->use_parent_ecpus &&
2481                     !is_partition_valid(sibling)) {
2482                         compute_effective_cpumask(tmp->new_cpus, sibling,
2483                                                   parent);
2484                         if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
2485                                 continue;
2486                 }
2487                 if (!css_tryget_online(&sibling->css))
2488                         continue;
2489 
2490                 rcu_read_unlock();
2491                 update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
2492                 rcu_read_lock();
2493                 css_put(&sibling->css);
2494         }
2495         rcu_read_unlock();
2496 }
2497 
2498 /**
2499  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
2500  * @cs: the cpuset to consider
2501  * @trialcs: trial cpuset
2502  * @buf: buffer of cpu numbers written to this cpuset
2503  */
2504 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
2505                           const char *buf)
2506 {
2507         int retval;
2508         struct tmpmasks tmp;
2509         struct cpuset *parent = parent_cs(cs);
2510         bool invalidate = false;
2511         int hier_flags = 0;
2512         int old_prs = cs->partition_root_state;
2513 
2514         /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
2515         if (cs == &top_cpuset)
2516                 return -EACCES;
2517 
2518         /*
2519          * An empty cpus_allowed is ok only if the cpuset has no tasks.
2520          * Since cpulist_parse() fails on an empty mask, we special case
2521          * that parsing.  The validate_change() call ensures that cpusets
2522          * with tasks have cpus.
2523          */
2524         if (!*buf) {
2525                 cpumask_clear(trialcs->cpus_allowed);
2526                 cpumask_clear(trialcs->effective_xcpus);
2527         } else {
2528                 retval = cpulist_parse(buf, trialcs->cpus_allowed);
2529                 if (retval < 0)
2530                         return retval;
2531 
2532                 if (!cpumask_subset(trialcs->cpus_allowed,
2533                                     top_cpuset.cpus_allowed))
2534                         return -EINVAL;
2535 
2536                 /*
2537                  * When exclusive_cpus isn't explicitly set, it is constrained
2538                  * by cpus_allowed and parent's effective_xcpus. Otherwise,
2539                  * trialcs->effective_xcpus is used as a temporary cpumask
2540                  * for checking validity of the partition root.
2541                  */
2542                 if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
2543                         compute_effective_exclusive_cpumask(trialcs, NULL);
2544         }
2545 
2546         /* Nothing to do if the cpus didn't change */
2547         if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
2548                 return 0;
2549 
2550         if (alloc_cpumasks(NULL, &tmp))
2551                 return -ENOMEM;
2552 
2553         if (old_prs) {
2554                 if (is_partition_valid(cs) &&
2555                     cpumask_empty(trialcs->effective_xcpus)) {
2556                         invalidate = true;
2557                         cs->prs_err = PERR_INVCPUS;
2558                 } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
2559                         invalidate = true;
2560                         cs->prs_err = PERR_HKEEPING;
2561                 } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
2562                         invalidate = true;
2563                         cs->prs_err = PERR_NOCPUS;
2564                 }
2565         }
2566 
2567         /*
2568          * Check all the descendants in update_cpumasks_hier() if
2569          * effective_xcpus is to be changed.
2570          */
2571         if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
2572                 hier_flags = HIER_CHECKALL;
2573 
2574         retval = validate_change(cs, trialcs);
2575 
2576         if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
2577                 struct cgroup_subsys_state *css;
2578                 struct cpuset *cp;
2579 
2580                 /*
2581                  * The -EINVAL error code indicates that the partition sibling
2582                  * CPU exclusivity rule has been violated. We still allow
2583                  * the cpumask change to proceed while invalidating the
2584                  * partition. However, any conflicting sibling partitions
2585                  * have to be marked as invalid too.
2586                  */
2587                 invalidate = true;
2588                 rcu_read_lock();
2589                 cpuset_for_each_child(cp, css, parent) {
2590                         struct cpumask *xcpus = fetch_xcpus(trialcs);
2591 
2592                         if (is_partition_valid(cp) &&
2593                             cpumask_intersects(xcpus, cp->effective_xcpus)) {
2594                                 rcu_read_unlock();
2595                                 update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
2596                                 rcu_read_lock();
2597                         }
2598                 }
2599                 rcu_read_unlock();
2600                 retval = 0;
2601         }
2602 
2603         if (retval < 0)
2604                 goto out_free;
2605 
2606         if (is_partition_valid(cs) ||
2607            (is_partition_invalid(cs) && !invalidate)) {
2608                 struct cpumask *xcpus = trialcs->effective_xcpus;
2609 
2610                 if (cpumask_empty(xcpus) && is_partition_invalid(cs))
2611                         xcpus = trialcs->cpus_allowed;
2612 
2613                 /*
2614                  * Call remote_cpus_update() to handle valid remote partition
2615                  */
2616                 if (is_remote_partition(cs))
2617                         remote_cpus_update(cs, xcpus, &tmp);
2618                 else if (invalidate)
2619                         update_parent_effective_cpumask(cs, partcmd_invalidate,
2620                                                         NULL, &tmp);
2621                 else
2622                         update_parent_effective_cpumask(cs, partcmd_update,
2623                                                         xcpus, &tmp);
2624         } else if (!cpumask_empty(cs->exclusive_cpus)) {
2625                 /*
2626                  * Use trialcs->effective_cpus as a temp cpumask
2627                  */
2628                 remote_partition_check(cs, trialcs->effective_xcpus,
2629                                        trialcs->effective_cpus, &tmp);
2630         }
2631 
2632         spin_lock_irq(&callback_lock);
2633         cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
2634         cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
2635         if ((old_prs > 0) && !is_partition_valid(cs))
2636                 reset_partition_data(cs);
2637         spin_unlock_irq(&callback_lock);
2638 
2639         /* effective_cpus/effective_xcpus will be updated here */
2640         update_cpumasks_hier(cs, &tmp, hier_flags);
2641 
2642         /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
2643         if (cs->partition_root_state)
2644                 update_partition_sd_lb(cs, old_prs);
2645 out_free:
2646         free_cpumasks(NULL, &tmp);
2647         return retval;
2648 }
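
/*
 * This path is reached from cpuset_write_resmask() when user space updates
 * the "cpuset.cpus" file, e.g. something like
 *   echo 2-5 > /sys/fs/cgroup/<group>/cpuset.cpus
 * on a cgroup v2 mount.
 */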
2649 
2650 /**
2651  * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
2652  * @cs: the cpuset to consider
2653  * @trialcs: trial cpuset
2654  * @buf: buffer of cpu numbers written to this cpuset
2655  *
2656  * The tasks' cpumask will be updated if cs is a valid partition root.
2657  */
2658 static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
2659                                     const char *buf)
2660 {
2661         int retval;
2662         struct tmpmasks tmp;
2663         struct cpuset *parent = parent_cs(cs);
2664         bool invalidate = false;
2665         int hier_flags = 0;
2666         int old_prs = cs->partition_root_state;
2667 
2668         if (!*buf) {
2669                 cpumask_clear(trialcs->exclusive_cpus);
2670                 cpumask_clear(trialcs->effective_xcpus);
2671         } else {
2672                 retval = cpulist_parse(buf, trialcs->exclusive_cpus);
2673                 if (retval < 0)
2674                         return retval;
2675         }
2676 
2677         /* Nothing to do if the CPUs didn't change */
2678         if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
2679                 return 0;
2680 
2681         if (*buf)
2682                 compute_effective_exclusive_cpumask(trialcs, NULL);
2683 
2684         /*
2685          * Check all the descendants in update_cpumasks_hier() if
2686          * effective_xcpus is to be changed.
2687          */
2688         if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
2689                 hier_flags = HIER_CHECKALL;
2690 
2691         retval = validate_change(cs, trialcs);
2692         if (retval)
2693                 return retval;
2694 
2695         if (alloc_cpumasks(NULL, &tmp))
2696                 return -ENOMEM;
2697 
2698         if (old_prs) {
2699                 if (cpumask_empty(trialcs->effective_xcpus)) {
2700                         invalidate = true;
2701                         cs->prs_err = PERR_INVCPUS;
2702                 } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
2703                         invalidate = true;
2704                         cs->prs_err = PERR_HKEEPING;
2705                 } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
2706                         invalidate = true;
2707                         cs->prs_err = PERR_NOCPUS;
2708                 }
2709 
2710                 if (is_remote_partition(cs)) {
2711                         if (invalidate)
2712                                 remote_partition_disable(cs, &tmp);
2713                         else
2714                                 remote_cpus_update(cs, trialcs->effective_xcpus,
2715                                                    &tmp);
2716                 } else if (invalidate) {
2717                         update_parent_effective_cpumask(cs, partcmd_invalidate,
2718                                                         NULL, &tmp);
2719                 } else {
2720                         update_parent_effective_cpumask(cs, partcmd_update,
2721                                                 trialcs->effective_xcpus, &tmp);
2722                 }
2723         } else if (!cpumask_empty(trialcs->exclusive_cpus)) {
2724                 /*
2725                  * Use trialcs->effective_cpus as a temp cpumask
2726                  */
2727                 remote_partition_check(cs, trialcs->effective_xcpus,
2728                                        trialcs->effective_cpus, &tmp);
2729         }
2730         spin_lock_irq(&callback_lock);
2731         cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
2732         cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
2733         if ((old_prs > 0) && !is_partition_valid(cs))
2734                 reset_partition_data(cs);
2735         spin_unlock_irq(&callback_lock);
2736 
2737         /*
2738          * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
2739          * of the subtree when it is a valid partition root or effective_xcpus
2740          * is updated.
2741          */
2742         if (is_partition_valid(cs) || hier_flags)
2743                 update_cpumasks_hier(cs, &tmp, hier_flags);
2744 
2745         /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
2746         if (cs->partition_root_state)
2747                 update_partition_sd_lb(cs, old_prs);
2748 
2749         free_cpumasks(NULL, &tmp);
2750         return 0;
2751 }
2752 
2753 /*
2754  * Migrate memory region from one set of nodes to another.  This is
2755  * performed asynchronously as it can be called from the process migration path
2756  * holding locks involved in process management.  All mm migrations are
2757  * performed in the queued order and can be waited for by flushing
2758  * cpuset_migrate_mm_wq.
2759  */
2760 
2761 struct cpuset_migrate_mm_work {
2762         struct work_struct      work;
2763         struct mm_struct        *mm;
2764         nodemask_t              from;
2765         nodemask_t              to;
2766 };
2767 
2768 static void cpuset_migrate_mm_workfn(struct work_struct *work)
2769 {
2770         struct cpuset_migrate_mm_work *mwork =
2771                 container_of(work, struct cpuset_migrate_mm_work, work);
2772 
2773         /* on a wq worker, no need to worry about %current's mems_allowed */
2774         do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
2775         mmput(mwork->mm);
2776         kfree(mwork);
2777 }
2778 
2779 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
2780                                                         const nodemask_t *to)
2781 {
2782         struct cpuset_migrate_mm_work *mwork;
2783 
2784         if (nodes_equal(*from, *to)) {
2785                 mmput(mm);
2786                 return;
2787         }
2788 
2789         mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
2790         if (mwork) {
2791                 mwork->mm = mm;
2792                 mwork->from = *from;
2793                 mwork->to = *to;
2794                 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
2795                 queue_work(cpuset_migrate_mm_wq, &mwork->work);
2796         } else {
2797                 mmput(mm);
2798         }
2799 }
2800 
2801 static void cpuset_post_attach(void)
2802 {
2803         flush_workqueue(cpuset_migrate_mm_wq);
2804 }
2805 
2806 /*
2807  * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
2808  * @tsk: the task to change
2809  * @newmems: new nodes that the task will be set
2810  *
2811  * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
2812  * and rebind the task's mempolicy, if any. If the task is allocating in
2813  * parallel, it might temporarily see an empty intersection, which results in
2814  * a seqlock check and retry before OOM or allocation failure.
2815  */
2816 static void cpuset_change_task_nodemask(struct task_struct *tsk,
2817                                         nodemask_t *newmems)
2818 {
2819         task_lock(tsk);
2820 
2821         local_irq_disable();
2822         write_seqcount_begin(&tsk->mems_allowed_seq);
2823 
2824         nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
2825         mpol_rebind_task(tsk, newmems);
2826         tsk->mems_allowed = *newmems;
2827 
2828         write_seqcount_end(&tsk->mems_allowed_seq);
2829         local_irq_enable();
2830 
2831         task_unlock(tsk);
2832 }
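
/*
 * A minimal, hypothetical userspace sketch of the seqcount retry pattern
 * that cpuset_change_task_nodemask() relies on: the writer bumps the
 * sequence to an odd value, updates the data, then bumps it back to even,
 * while readers retry whenever they observe an odd or changed sequence.
 * It uses C11 atomics for illustration only; it is not the kernel
 * seqcount_t API.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;                 /* even: stable, odd: update in flight */
static atomic_int mask[2];              /* stands in for tsk->mems_allowed */

static void write_mask(int a, int b)
{
        atomic_fetch_add(&seq, 1);      /* begin: sequence becomes odd */
        atomic_store(&mask[0], a);
        atomic_store(&mask[1], b);
        atomic_fetch_add(&seq, 1);      /* end: sequence becomes even again */
}

static void read_mask(int out[2])
{
        unsigned int s;

        do {
                /* wait for an even (stable) sequence value */
                while ((s = atomic_load(&seq)) & 1)
                        ;
                out[0] = atomic_load(&mask[0]);
                out[1] = atomic_load(&mask[1]);
        } while (atomic_load(&seq) != s);       /* changed underneath: retry */
}

int main(void)
{
        int snap[2];

        write_mask(1, 2);
        read_mask(snap);
        printf("snapshot: %d %d\n", snap[0], snap[1]);
        return 0;
}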
2833 
2834 static void *cpuset_being_rebound;
2835 
2836 /**
2837  * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
2838  * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
2839  *
2840  * Iterate through each task of @cs updating its mems_allowed to the
2841  * cpuset's effective mems.  As this function is called with cpuset_mutex held,
2842  * cpuset membership stays stable.
2843  */
2844 static void update_tasks_nodemask(struct cpuset *cs)
2845 {
2846         static nodemask_t newmems;      /* protected by cpuset_mutex */
2847         struct css_task_iter it;
2848         struct task_struct *task;
2849 
2850         cpuset_being_rebound = cs;              /* causes mpol_dup() rebind */
2851 
2852         guarantee_online_mems(cs, &newmems);
2853 
2854         /*
2855          * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
2856          * take while holding tasklist_lock.  Forks can happen - the
2857          * mpol_dup() cpuset_being_rebound check will catch such forks,
2858          * and rebind their vma mempolicies too.  Because we still hold
2859          * the global cpuset_mutex, we know that no other rebind effort
2860          * will be contending for the global variable cpuset_being_rebound.
2861          * It's ok if we rebind the same mm twice; mpol_rebind_mm()
2862          * is idempotent.  Also migrate pages in each mm to new nodes.
2863          */
2864         css_task_iter_start(&cs->css, 0, &it);
2865         while ((task = css_task_iter_next(&it))) {
2866                 struct mm_struct *mm;
2867                 bool migrate;
2868 
2869                 cpuset_change_task_nodemask(task, &newmems);
2870 
2871                 mm = get_task_mm(task);
2872                 if (!mm)
2873                         continue;
2874 
2875                 migrate = is_memory_migrate(cs);
2876 
2877                 mpol_rebind_mm(mm, &cs->mems_allowed);
2878                 if (migrate)
2879                         cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
2880                 else
2881                         mmput(mm);
2882         }
2883         css_task_iter_end(&it);
2884 
2885         /*
2886          * All the tasks' nodemasks have been updated, update
2887          * cs->old_mems_allowed.
2888          */
2889         cs->old_mems_allowed = newmems;
2890 
2891         /* We're done rebinding vmas to this cpuset's new mems_allowed. */
2892         cpuset_being_rebound = NULL;
2893 }
2894 
2895 /*
2896  * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
2897  * @cs: the cpuset to consider
2898  * @new_mems: a temp variable for calculating new effective_mems
2899  *
2900  * When configured nodemask is changed, the effective nodemasks of this cpuset
2901  * and all its descendants need to be updated.
2902  *
2903  * On the legacy hierarchy, effective_mems will be the same as mems_allowed.
2904  *
2905  * Called with cpuset_mutex held
2906  */
2907 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
2908 {
2909         struct cpuset *cp;
2910         struct cgroup_subsys_state *pos_css;
2911 
2912         rcu_read_lock();
2913         cpuset_for_each_descendant_pre(cp, pos_css, cs) {
2914                 struct cpuset *parent = parent_cs(cp);
2915 
2916                 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
2917 
2918                 /*
2919                  * If it becomes empty, inherit the effective mask of the
2920                  * parent, which is guaranteed to have some MEMs.
2921                  */
2922                 if (is_in_v2_mode() && nodes_empty(*new_mems))
2923                         *new_mems = parent->effective_mems;
2924 
2925                 /* Skip the whole subtree if the nodemask remains the same. */
2926                 if (nodes_equal(*new_mems, cp->effective_mems)) {
2927                         pos_css = css_rightmost_descendant(pos_css);
2928                         continue;
2929                 }
2930 
2931                 if (!css_tryget_online(&cp->css))
2932                         continue;
2933                 rcu_read_unlock();
2934 
2935                 spin_lock_irq(&callback_lock);
2936                 cp->effective_mems = *new_mems;
2937                 spin_unlock_irq(&callback_lock);
2938 
2939                 WARN_ON(!is_in_v2_mode() &&
2940                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
2941 
2942                 update_tasks_nodemask(cp);
2943 
2944                 rcu_read_lock();
2945                 css_put(&cp->css);
2946         }
2947         rcu_read_unlock();
2948 }
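
/*
 * A small, self-contained sketch (not kernel code) of the propagation rule
 * that update_nodemasks_hier() implements: each descendant's effective_mems
 * is its own mems_allowed ANDed with its parent's effective_mems and, on the
 * default hierarchy, an empty result falls back to the parent's mask.
 * Nodemasks are modeled as plain bitmasks and the "skip the whole subtree if
 * nothing changed" optimization becomes an early return.
 */
#include <stdio.h>

struct cs {
        unsigned int mems_allowed;      /* configured mask */
        unsigned int effective_mems;    /* derived mask */
        struct cs *child, *sibling;     /* simple n-ary tree links */
};

static void propagate(struct cs *cp, unsigned int parent_effective, int v2)
{
        unsigned int new_mems = cp->mems_allowed & parent_effective;

        if (v2 && !new_mems)            /* empty: inherit the parent's mask */
                new_mems = parent_effective;

        if (new_mems == cp->effective_mems)
                return;                 /* unchanged: skip the whole subtree */

        cp->effective_mems = new_mems;
        for (struct cs *c = cp->child; c; c = c->sibling)
                propagate(c, new_mems, v2);
}

int main(void)
{
        struct cs leaf = { .mems_allowed = 0x8 };               /* node 3 only */
        struct cs mid  = { .mems_allowed = 0x6, .child = &leaf };
        struct cs root = { .mems_allowed = 0xf, .effective_mems = 0xf,
                           .child = &mid };

        for (struct cs *c = root.child; c; c = c->sibling)
                propagate(c, root.effective_mems, 1);

        /* prints "mid: 0x6 leaf: 0x6": the empty leaf falls back to mid's mask */
        printf("mid: %#x leaf: %#x\n", mid.effective_mems, leaf.effective_mems);
        return 0;
}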
2949 
2950 /*
2951  * Handle user request to change the 'mems' memory placement
2952  * of a cpuset.  Needs to validate the request, update the
2953  * cpuset's mems_allowed, and for each task in the cpuset,
2954  * update mems_allowed, rebind the task's mempolicy and any vma
2955  * mempolicies, and if the cpuset is marked 'memory_migrate',
2956  * migrate the task's pages to the new memory.
2957  *
2958  * Call with cpuset_mutex held. May take callback_lock during call.
2959  * Will take tasklist_lock, scan the tasklist for tasks in cpuset cs,
2960  * lock each such task's mm->mmap_lock, scan its vmas and rebind
2961  * their mempolicies to the cpuset's new mems_allowed.
2962  */
2963 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
2964                            const char *buf)
2965 {
2966         int retval;
2967 
2968         /*
2969          * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
2970          * it's read-only
2971          */
2972         if (cs == &top_cpuset) {
2973                 retval = -EACCES;
2974                 goto done;
2975         }
2976 
2977         /*
2978          * An empty mems_allowed is ok iff there are no tasks in the cpuset.
2979          * Since nodelist_parse() fails on an empty mask, we special case
2980          * that parsing.  The validate_change() call ensures that cpusets
2981          * with tasks have memory.
2982          */
2983         if (!*buf) {
2984                 nodes_clear(trialcs->mems_allowed);
2985         } else {
2986                 retval = nodelist_parse(buf, trialcs->mems_allowed);
2987                 if (retval < 0)
2988                         goto done;
2989 
2990                 if (!nodes_subset(trialcs->mems_allowed,
2991                                   top_cpuset.mems_allowed)) {
2992                         retval = -EINVAL;
2993                         goto done;
2994                 }
2995         }
2996 
2997         if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
2998                 retval = 0;             /* Too easy - nothing to do */
2999                 goto done;
3000         }
3001         retval = validate_change(cs, trialcs);
3002         if (retval < 0)
3003                 goto done;
3004 
3005         check_insane_mems_config(&trialcs->mems_allowed);
3006 
3007         spin_lock_irq(&callback_lock);
3008         cs->mems_allowed = trialcs->mems_allowed;
3009         spin_unlock_irq(&callback_lock);
3010 
3011         /* use trialcs->mems_allowed as a temp variable */
3012         update_nodemasks_hier(cs, &trialcs->mems_allowed);
3013 done:
3014         return retval;
3015 }
3016 
3017 bool current_cpuset_is_being_rebound(void)
3018 {
3019         bool ret;
3020 
3021         rcu_read_lock();
3022         ret = task_cs(current) == cpuset_being_rebound;
3023         rcu_read_unlock();
3024 
3025         return ret;
3026 }
3027 
3028 static int update_relax_domain_level(struct cpuset *cs, s64 val)
3029 {
3030 #ifdef CONFIG_SMP
3031         if (val < -1 || val > sched_domain_level_max + 1)
3032                 return -EINVAL;
3033 #endif
3034 
3035         if (val != cs->relax_domain_level) {
3036                 cs->relax_domain_level = val;
3037                 if (!cpumask_empty(cs->cpus_allowed) &&
3038                     is_sched_load_balance(cs))
3039                         rebuild_sched_domains_locked();
3040         }
3041 
3042         return 0;
3043 }
3044 
3045 /**
3046  * update_tasks_flags - update the spread flags of tasks in the cpuset.
3047  * @cs: the cpuset in which each task's spread flags need to be changed
3048  *
3049  * Iterate through each task of @cs updating its spread flags.  As this
3050  * function is called with cpuset_mutex held, cpuset membership stays
3051  * stable.
3052  */
3053 static void update_tasks_flags(struct cpuset *cs)
3054 {
3055         struct css_task_iter it;
3056         struct task_struct *task;
3057 
3058         css_task_iter_start(&cs->css, 0, &it);
3059         while ((task = css_task_iter_next(&it)))
3060                 cpuset_update_task_spread_flags(cs, task);
3061         css_task_iter_end(&it);
3062 }
3063 
3064 /*
3065  * update_flag - read a 0 or a 1 in a file and update associated flag
3066  * bit:         the bit to update (see cpuset_flagbits_t)
3067  * cs:          the cpuset to update
3068  * turning_on:  whether the flag is being set or cleared
3069  *
3070  * Call with cpuset_mutex held.
3071  */
3072 
3073 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
3074                        int turning_on)
3075 {
3076         struct cpuset *trialcs;
3077         int balance_flag_changed;
3078         int spread_flag_changed;
3079         int err;
3080 
3081         trialcs = alloc_trial_cpuset(cs);
3082         if (!trialcs)
3083                 return -ENOMEM;
3084 
3085         if (turning_on)
3086                 set_bit(bit, &trialcs->flags);
3087         else
3088                 clear_bit(bit, &trialcs->flags);
3089 
3090         err = validate_change(cs, trialcs);
3091         if (err < 0)
3092                 goto out;
3093 
3094         balance_flag_changed = (is_sched_load_balance(cs) !=
3095                                 is_sched_load_balance(trialcs));
3096 
3097         spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
3098                         || (is_spread_page(cs) != is_spread_page(trialcs)));
3099 
3100         spin_lock_irq(&callback_lock);
3101         cs->flags = trialcs->flags;
3102         spin_unlock_irq(&callback_lock);
3103 
3104         if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
3105                 rebuild_sched_domains_locked();
3106 
3107         if (spread_flag_changed)
3108                 update_tasks_flags(cs);
3109 out:
3110         free_cpuset(trialcs);
3111         return err;
3112 }
3113 
3114 /**
3115  * update_prstate - update partition_root_state
3116  * @cs: the cpuset to update
3117  * @new_prs: new partition root state
3118  * Return: 0 if successful, != 0 if error
3119  *
3120  * Call with cpuset_mutex held.
3121  */
3122 static int update_prstate(struct cpuset *cs, int new_prs)
3123 {
3124         int err = PERR_NONE, old_prs = cs->partition_root_state;
3125         struct cpuset *parent = parent_cs(cs);
3126         struct tmpmasks tmpmask;
3127         bool new_xcpus_state = false;
3128 
3129         if (old_prs == new_prs)
3130                 return 0;
3131 
3132         /*
3133          * Treat a previously invalid partition root as if it is a "member".
3134          */
3135         if (new_prs && is_prs_invalid(old_prs))
3136                 old_prs = PRS_MEMBER;
3137 
3138         if (alloc_cpumasks(NULL, &tmpmask))
3139                 return -ENOMEM;
3140 
3141         /*
3142          * Set up effective_xcpus if it is not properly set yet; it will be
3143          * cleared later if the partition becomes invalid.
3144          */
3145         if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
3146                 spin_lock_irq(&callback_lock);
3147                 cpumask_and(cs->effective_xcpus,
3148                             cs->cpus_allowed, parent->effective_xcpus);
3149                 spin_unlock_irq(&callback_lock);
3150         }
3151 
3152         err = update_partition_exclusive(cs, new_prs);
3153         if (err)
3154                 goto out;
3155 
3156         if (!old_prs) {
3157                 enum partition_cmd cmd = (new_prs == PRS_ROOT)
3158                                        ? partcmd_enable : partcmd_enablei;
3159 
3160                 /*
3161          * cpus_allowed and exclusive_cpus cannot both be empty.
3162                  */
3163                 if (xcpus_empty(cs)) {
3164                         err = PERR_CPUSEMPTY;
3165                         goto out;
3166                 }
3167 
3168                 err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
3169                 /*
3170                  * If an attempt to become local partition root fails,
3171                  * try to become a remote partition root instead.
3172                  */
3173                 if (err && remote_partition_enable(cs, new_prs, &tmpmask))
3174                         err = 0;
3175         } else if (old_prs && new_prs) {
3176                 /*
3177                  * A change in load balance state only, no change in cpumasks.
3178                  */
3179                 new_xcpus_state = true;
3180         } else {
3181                 /*
3182                  * Switching back to member is always allowed even if it
3183                  * disables child partitions.
3184                  */
3185                 if (is_remote_partition(cs))
3186                         remote_partition_disable(cs, &tmpmask);
3187                 else
3188                         update_parent_effective_cpumask(cs, partcmd_disable,
3189                                                         NULL, &tmpmask);
3190 
3191                 /*
3192                  * Invalidation of child partitions will be done in
3193                  * update_cpumasks_hier().
3194                  */
3195         }
3196 out:
3197         /*
3198          * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
3199          * happens.
3200          */
3201         if (err) {
3202                 new_prs = -new_prs;
3203                 update_partition_exclusive(cs, new_prs);
3204         }
3205 
3206         spin_lock_irq(&callback_lock);
3207         cs->partition_root_state = new_prs;
3208         WRITE_ONCE(cs->prs_err, err);
3209         if (!is_partition_valid(cs))
3210                 reset_partition_data(cs);
3211         else if (new_xcpus_state)
3212                 partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
3213         spin_unlock_irq(&callback_lock);
3214         update_unbound_workqueue_cpumask(new_xcpus_state);
3215 
3216         /* Force update if switching back to member */
3217         update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
3218 
3219         /* Update sched domains and load balance flag */
3220         update_partition_sd_lb(cs, old_prs);
3221 
3222         notify_partition_change(cs, old_prs);
3223         free_cpumasks(NULL, &tmpmask);
3224         return 0;
3225 }
3226 
3227 /*
3228  * Frequency meter - How fast is some event occurring?
3229  *
3230  * These routines manage a digitally filtered, constant time based,
3231  * event frequency meter.  There are four routines:
3232  *   fmeter_init() - initialize a frequency meter.
3233  *   fmeter_markevent() - called each time the event happens.
3234  *   fmeter_getrate() - returns the recent rate of such events.
3235  *   fmeter_update() - internal routine used to update fmeter.
3236  *
3237  * A common data structure is passed to each of these routines,
3238  * which is used to keep track of the state required to manage the
3239  * frequency meter and its digital filter.
3240  *
3241  * The filter works on the number of events marked per unit time.
3242  * The filter is single-pole low-pass recursive (IIR).  The time unit
3243  * is 1 second.  Arithmetic is done using 32-bit integers scaled to
3244  * simulate 3 decimal digits of precision (multiplied by 1000).
3245  *
3246  * With an FM_COEF of 933, and a time base of 1 second, the filter
3247  * has a half-life of 10 seconds, meaning that if the events quit
3248  * happening, then the rate returned from the fmeter_getrate()
3249  * will be cut in half each 10 seconds, until it converges to zero.
3250  *
3251  * It is not worth doing a real infinitely recursive filter.  If more
3252  * than FM_MAXTICKS ticks have elapsed since the last filter event,
3253  * just compute FM_MAXTICKS ticks worth, by which point the level
3254  * will be stable.
3255  *
3256  * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
3257  * arithmetic overflow in the fmeter_update() routine.
3258  *
3259  * Given the simple 32 bit integer arithmetic used, this meter works
3260  * best for reporting rates between one per millisecond (msec) and
3261  * one per 32 (approx) seconds.  At constant rates faster than one
3262  * per msec it maxes out at values just under 1,000,000.  At constant
3263  * rates between one per msec, and one per second it will stabilize
3264  * to a value N*1000, where N is the rate of events per second.
3265  * At constant rates between one per second and one per 32 seconds,
3266  * it will be choppy, moving up on the seconds that have an event,
3267  * and then decaying until the next event.  At rates slower than
3268  * about one in 32 seconds, it decays all the way back to zero between
3269  * each event.
3270  */
3271 
3272 #define FM_COEF 933             /* coefficient for half-life of 10 secs */
3273 #define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
3274 #define FM_MAXCNT 1000000       /* limit cnt to avoid overflow */
3275 #define FM_SCALE 1000           /* faux fixed point scale */
3276 
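/*
 * A standalone sketch (not kernel code) of the decay arithmetic described
 * above, using the same constants: with FM_COEF = 933 and one tick per
 * second, ten idle ticks cut the filtered value roughly in half, which is
 * the 10 second half-life mentioned in the comment.  All math stays within
 * 32-bit range because values are capped near 1,000,000.
 */
#include <stdio.h>

#define DEMO_FM_COEF  933               /* same coefficient as FM_COEF */
#define DEMO_FM_SCALE 1000              /* same scale as FM_SCALE */

int main(void)
{
        int val = 1000000;              /* a near-maximal filtered rate */
        int tick;

        for (tick = 1; tick <= 10; tick++) {
                val = (DEMO_FM_COEF * val) / DEMO_FM_SCALE;
                printf("after %2d idle ticks: %d\n", tick, val);
        }
        /* ends at ~499828 after 10 ticks, i.e. about half of the start */
        return 0;
}
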
3277 /* Initialize a frequency meter */
3278 static void fmeter_init(struct fmeter *fmp)
3279 {
3280         fmp->cnt = 0;
3281         fmp->val = 0;
3282         fmp->time = 0;
3283         spin_lock_init(&fmp->lock);
3284 }
3285 
3286 /* Internal meter update - process cnt events and update value */
3287 static void fmeter_update(struct fmeter *fmp)
3288 {
3289         time64_t now;
3290         u32 ticks;
3291 
3292         now = ktime_get_seconds();
3293         ticks = now - fmp->time;
3294 
3295         if (ticks == 0)
3296                 return;
3297 
3298         ticks = min(FM_MAXTICKS, ticks);
3299         while (ticks-- > 0)
3300                 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
3301         fmp->time = now;
3302 
3303         fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
3304         fmp->cnt = 0;
3305 }
3306 
3307 /* Process any previous ticks, then bump cnt by one (times scale). */
3308 static void fmeter_markevent(struct fmeter *fmp)
3309 {
3310         spin_lock(&fmp->lock);
3311         fmeter_update(fmp);
3312         fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
3313         spin_unlock(&fmp->lock);
3314 }
3315 
3316 /* Process any previous ticks, then return current value. */
3317 static int fmeter_getrate(struct fmeter *fmp)
3318 {
3319         int val;
3320 
3321         spin_lock(&fmp->lock);
3322         fmeter_update(fmp);
3323         val = fmp->val;
3324         spin_unlock(&fmp->lock);
3325         return val;
3326 }
3327 
3328 static struct cpuset *cpuset_attach_old_cs;
3329 
3330 /*
3331  * Check to see if a cpuset can accept a new task
3332  * For v1, cpus_allowed and mems_allowed can't be empty.
3333  * For v2, effective_cpus can't be empty.
3334  * Note that in v1, effective_cpus = cpus_allowed.
3335  */
3336 static int cpuset_can_attach_check(struct cpuset *cs)
3337 {
3338         if (cpumask_empty(cs->effective_cpus) ||
3339            (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
3340                 return -ENOSPC;
3341         return 0;
3342 }
3343 
3344 static void reset_migrate_dl_data(struct cpuset *cs)
3345 {
3346         cs->nr_migrate_dl_tasks = 0;
3347         cs->sum_migrate_dl_bw = 0;
3348 }
3349 
3350 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
3351 static int cpuset_can_attach(struct cgroup_taskset *tset)
3352 {
3353         struct cgroup_subsys_state *css;
3354         struct cpuset *cs, *oldcs;
3355         struct task_struct *task;
3356         bool cpus_updated, mems_updated;
3357         int ret;
3358 
3359         /* used later by cpuset_attach() */
3360         cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
3361         oldcs = cpuset_attach_old_cs;
3362         cs = css_cs(css);
3363 
3364         mutex_lock(&cpuset_mutex);
3365 
3366         /* Check to see if task is allowed in the cpuset */
3367         ret = cpuset_can_attach_check(cs);
3368         if (ret)
3369                 goto out_unlock;
3370 
3371         cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus);
3372         mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
3373 
3374         cgroup_taskset_for_each(task, css, tset) {
3375                 ret = task_can_attach(task);
3376                 if (ret)
3377                         goto out_unlock;
3378 
3379                 /*
3380                  * Skip the rights-over-task check in v2 when nothing changes;
3381                  * migration permission derives from hierarchy ownership in
3382                  * cgroup_procs_write_permission().
3383                  */
3384                 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
3385                     (cpus_updated || mems_updated)) {
3386                         ret = security_task_setscheduler(task);
3387                         if (ret)
3388                                 goto out_unlock;
3389                 }
3390 
3391                 if (dl_task(task)) {
3392                         cs->nr_migrate_dl_tasks++;
3393                         cs->sum_migrate_dl_bw += task->dl.dl_bw;
3394                 }
3395         }
3396 
3397         if (!cs->nr_migrate_dl_tasks)
3398                 goto out_success;
3399 
3400         if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
3401                 int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
3402 
3403                 if (unlikely(cpu >= nr_cpu_ids)) {
3404                         reset_migrate_dl_data(cs);
3405                         ret = -EINVAL;
3406                         goto out_unlock;
3407                 }
3408 
3409                 ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
3410                 if (ret) {
3411                         reset_migrate_dl_data(cs);
3412                         goto out_unlock;
3413                 }
3414         }
3415 
3416 out_success:
3417         /*
3418          * Mark attach is in progress.  This makes validate_change() fail
3419          * changes which zero cpus/mems_allowed.
3420          */
3421         cs->attach_in_progress++;
3422 out_unlock:
3423         mutex_unlock(&cpuset_mutex);
3424         return ret;
3425 }
3426 
3427 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
3428 {
3429         struct cgroup_subsys_state *css;
3430         struct cpuset *cs;
3431 
3432         cgroup_taskset_first(tset, &css);
3433         cs = css_cs(css);
3434 
3435         mutex_lock(&cpuset_mutex);
3436         cs->attach_in_progress--;
3437         if (!cs->attach_in_progress)
3438                 wake_up(&cpuset_attach_wq);
3439 
3440         if (cs->nr_migrate_dl_tasks) {
3441                 int cpu = cpumask_any(cs->effective_cpus);
3442 
3443                 dl_bw_free(cpu, cs->sum_migrate_dl_bw);
3444                 reset_migrate_dl_data(cs);
3445         }
3446 
3447         mutex_unlock(&cpuset_mutex);
3448 }
3449 
3450 /*
3451  * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
3452  * but we can't allocate it dynamically there.  Define it globally and
3453  * allocate from cpuset_init().
3454  */
3455 static cpumask_var_t cpus_attach;
3456 static nodemask_t cpuset_attach_nodemask_to;
3457 
3458 static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
3459 {
3460         lockdep_assert_held(&cpuset_mutex);
3461 
3462         if (cs != &top_cpuset)
3463                 guarantee_online_cpus(task, cpus_attach);
3464         else
3465                 cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
3466                                subpartitions_cpus);
3467         /*
3468          * can_attach beforehand should guarantee that this doesn't
3469          * fail.  TODO: have a better way to handle failure here
3470          */
3471         WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
3472 
3473         cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
3474         cpuset_update_task_spread_flags(cs, task);
3475 }
3476 
3477 static void cpuset_attach(struct cgroup_taskset *tset)
3478 {
3479         struct task_struct *task;
3480         struct task_struct *leader;
3481         struct cgroup_subsys_state *css;
3482         struct cpuset *cs;
3483         struct cpuset *oldcs = cpuset_attach_old_cs;
3484         bool cpus_updated, mems_updated;
3485 
3486         cgroup_taskset_first(tset, &css);
3487         cs = css_cs(css);
3488 
3489         lockdep_assert_cpus_held();     /* see cgroup_attach_lock() */
3490         mutex_lock(&cpuset_mutex);
3491         cpus_updated = !cpumask_equal(cs->effective_cpus,
3492                                       oldcs->effective_cpus);
3493         mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
3494 
3495         /*
3496          * In the default hierarchy, enabling cpuset in the child cgroups
3497          * will trigger a number of cpuset_attach() calls with no change
3498          * in effective cpus and mems. In that case, we can optimize out
3499          * by skipping the task iteration and update.
3500          */
3501         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
3502             !cpus_updated && !mems_updated) {
3503                 cpuset_attach_nodemask_to = cs->effective_mems;
3504                 goto out;
3505         }
3506 
3507         guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
3508 
3509         cgroup_taskset_for_each(task, css, tset)
3510                 cpuset_attach_task(cs, task);
3511 
3512         /*
3513          * Change mm for all threadgroup leaders. This is expensive and may
3514          * sleep, and should be moved outside the migration path proper. Skip it
3515          * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
3516          * not set.
3517          */
3518         cpuset_attach_nodemask_to = cs->effective_mems;
3519         if (!is_memory_migrate(cs) && !mems_updated)
3520                 goto out;
3521 
3522         cgroup_taskset_for_each_leader(leader, css, tset) {
3523                 struct mm_struct *mm = get_task_mm(leader);
3524 
3525                 if (mm) {
3526                         mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
3527 
3528                         /*
3529                          * old_mems_allowed is the same as mems_allowed
3530                          * here, except if this task is being moved
3531                          * automatically due to hotplug.  In that case
3532                          * @mems_allowed has been updated and is empty, so
3533                          * @old_mems_allowed is the right nodemask to
3534                          * migrate the mm from.
3535                          */
3536                         if (is_memory_migrate(cs))
3537                                 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
3538                                                   &cpuset_attach_nodemask_to);
3539                         else
3540                                 mmput(mm);
3541                 }
3542         }
3543 
3544 out:
3545         cs->old_mems_allowed = cpuset_attach_nodemask_to;
3546 
3547         if (cs->nr_migrate_dl_tasks) {
3548                 cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
3549                 oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
3550                 reset_migrate_dl_data(cs);
3551         }
3552 
3553         cs->attach_in_progress--;
3554         if (!cs->attach_in_progress)
3555                 wake_up(&cpuset_attach_wq);
3556 
3557         mutex_unlock(&cpuset_mutex);
3558 }
3559 
3560 /* The various types of files and directories in a cpuset file system */
3561 
3562 typedef enum {
3563         FILE_MEMORY_MIGRATE,
3564         FILE_CPULIST,
3565         FILE_MEMLIST,
3566         FILE_EFFECTIVE_CPULIST,
3567         FILE_EFFECTIVE_MEMLIST,
3568         FILE_SUBPARTS_CPULIST,
3569         FILE_EXCLUSIVE_CPULIST,
3570         FILE_EFFECTIVE_XCPULIST,
3571         FILE_ISOLATED_CPULIST,
3572         FILE_CPU_EXCLUSIVE,
3573         FILE_MEM_EXCLUSIVE,
3574         FILE_MEM_HARDWALL,
3575         FILE_SCHED_LOAD_BALANCE,
3576         FILE_PARTITION_ROOT,
3577         FILE_SCHED_RELAX_DOMAIN_LEVEL,
3578         FILE_MEMORY_PRESSURE_ENABLED,
3579         FILE_MEMORY_PRESSURE,
3580         FILE_SPREAD_PAGE,
3581         FILE_SPREAD_SLAB,
3582 } cpuset_filetype_t;
3583 
3584 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
3585                             u64 val)
3586 {
3587         struct cpuset *cs = css_cs(css);
3588         cpuset_filetype_t type = cft->private;
3589         int retval = 0;
3590 
3591         cpus_read_lock();
3592         mutex_lock(&cpuset_mutex);
3593         if (!is_cpuset_online(cs)) {
3594                 retval = -ENODEV;
3595                 goto out_unlock;
3596         }
3597 
3598         switch (type) {
3599         case FILE_CPU_EXCLUSIVE:
3600                 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
3601                 break;
3602         case FILE_MEM_EXCLUSIVE:
3603                 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
3604                 break;
3605         case FILE_MEM_HARDWALL:
3606                 retval = update_flag(CS_MEM_HARDWALL, cs, val);
3607                 break;
3608         case FILE_SCHED_LOAD_BALANCE:
3609                 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
3610                 break;
3611         case FILE_MEMORY_MIGRATE:
3612                 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
3613                 break;
3614         case FILE_MEMORY_PRESSURE_ENABLED:
3615                 cpuset_memory_pressure_enabled = !!val;
3616                 break;
3617         case FILE_SPREAD_PAGE:
3618                 retval = update_flag(CS_SPREAD_PAGE, cs, val);
3619                 break;
3620         case FILE_SPREAD_SLAB:
3621                 retval = update_flag(CS_SPREAD_SLAB, cs, val);
3622                 break;
3623         default:
3624                 retval = -EINVAL;
3625                 break;
3626         }
3627 out_unlock:
3628         mutex_unlock(&cpuset_mutex);
3629         cpus_read_unlock();
3630         return retval;
3631 }
3632 
3633 static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
3634                             s64 val)
3635 {
3636         struct cpuset *cs = css_cs(css);
3637         cpuset_filetype_t type = cft->private;
3638         int retval = -ENODEV;
3639 
3640         cpus_read_lock();
3641         mutex_lock(&cpuset_mutex);
3642         if (!is_cpuset_online(cs))
3643                 goto out_unlock;
3644 
3645         switch (type) {
3646         case FILE_SCHED_RELAX_DOMAIN_LEVEL:
3647                 retval = update_relax_domain_level(cs, val);
3648                 break;
3649         default:
3650                 retval = -EINVAL;
3651                 break;
3652         }
3653 out_unlock:
3654         mutex_unlock(&cpuset_mutex);
3655         cpus_read_unlock();
3656         return retval;
3657 }
3658 
3659 /*
3660  * Common handling for a write to a "cpus" or "mems" file.
3661  */
3662 static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
3663                                     char *buf, size_t nbytes, loff_t off)
3664 {
3665         struct cpuset *cs = css_cs(of_css(of));
3666         struct cpuset *trialcs;
3667         int retval = -ENODEV;
3668 
3669         buf = strstrip(buf);
3670 
3671         /*
3672          * CPU or memory hotunplug may leave @cs w/o any execution
3673          * resources, in which case the hotplug code asynchronously updates
3674          * configuration and transfers all tasks to the nearest ancestor
3675          * which can execute.
3676          *
3677          * As writes to "cpus" or "mems" may restore @cs's execution
3678          * resources, wait for the previously scheduled operations before
3679          * proceeding, so that we don't end up repeatedly removing tasks added
3680          * after execution capability is restored.
3681          *
3682          * cpuset_handle_hotplug may call back into cgroup core asynchronously
3683          * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
3684          * operation like this one can lead to a deadlock through kernfs
3685          * active_ref protection.  Let's break the protection.  Losing the
3686          * protection is okay as we check whether @cs is online after
3687          * grabbing cpuset_mutex anyway.  This only happens on the legacy
3688          * hierarchies.
3689          */
3690         css_get(&cs->css);
3691         kernfs_break_active_protection(of->kn);
3692 
3693         cpus_read_lock();
3694         mutex_lock(&cpuset_mutex);
3695         if (!is_cpuset_online(cs))
3696                 goto out_unlock;
3697 
3698         trialcs = alloc_trial_cpuset(cs);
3699         if (!trialcs) {
3700                 retval = -ENOMEM;
3701                 goto out_unlock;
3702         }
3703 
3704         switch (of_cft(of)->private) {
3705         case FILE_CPULIST:
3706                 retval = update_cpumask(cs, trialcs, buf);
3707                 break;
3708         case FILE_EXCLUSIVE_CPULIST:
3709                 retval = update_exclusive_cpumask(cs, trialcs, buf);
3710                 break;
3711         case FILE_MEMLIST:
3712                 retval = update_nodemask(cs, trialcs, buf);
3713                 break;
3714         default:
3715                 retval = -EINVAL;
3716                 break;
3717         }
3718 
3719         free_cpuset(trialcs);
3720 out_unlock:
3721         mutex_unlock(&cpuset_mutex);
3722         cpus_read_unlock();
3723         kernfs_unbreak_active_protection(of->kn);
3724         css_put(&cs->css);
3725         flush_workqueue(cpuset_migrate_mm_wq);
3726         return retval ?: nbytes;
3727 }
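
/*
 * A hypothetical userspace sketch of driving this write handler through
 * cgroupfs.  It assumes a cgroup v2 mount at /sys/fs/cgroup and an already
 * created child group named "demo" with the cpuset controller enabled; the
 * mount point and group name are illustrative, not mandated by this file.
 * The written string uses the same list format that cpulist_parse() and
 * nodelist_parse() accept, e.g. "0-3".
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_cg_file(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror(path);
                return -1;
        }
        if (write(fd, val, strlen(val)) < 0)
                perror("write");
        close(fd);
        return 0;
}

int main(void)
{
        /* assumed paths; adjust to the actual mount point and group */
        write_cg_file("/sys/fs/cgroup/demo/cpuset.cpus", "0-3");
        write_cg_file("/sys/fs/cgroup/demo/cpuset.mems", "0");
        return 0;
}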
3728 
3729 /*
3730  * These ascii lists should be read in a single call, by using a user
3731  * buffer large enough to hold the entire map.  If read in smaller
3732  * chunks, there is no guarantee of atomicity.  Since the display format
3733  * used (a list of ranges of sequential numbers) is variable length,
3734  * and since these maps can change value dynamically, one could read
3735  * gibberish by doing partial reads while a list was changing.
3736  */
3737 static int cpuset_common_seq_show(struct seq_file *sf, void *v)
3738 {
3739         struct cpuset *cs = css_cs(seq_css(sf));
3740         cpuset_filetype_t type = seq_cft(sf)->private;
3741         int ret = 0;
3742 
3743         spin_lock_irq(&callback_lock);
3744 
3745         switch (type) {
3746         case FILE_CPULIST:
3747                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
3748                 break;
3749         case FILE_MEMLIST:
3750                 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
3751                 break;
3752         case FILE_EFFECTIVE_CPULIST:
3753                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
3754                 break;
3755         case FILE_EFFECTIVE_MEMLIST:
3756                 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
3757                 break;
3758         case FILE_EXCLUSIVE_CPULIST:
3759                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
3760                 break;
3761         case FILE_EFFECTIVE_XCPULIST:
3762                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
3763                 break;
3764         case FILE_SUBPARTS_CPULIST:
3765                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
3766                 break;
3767         case FILE_ISOLATED_CPULIST:
3768                 seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
3769                 break;
3770         default:
3771                 ret = -EINVAL;
3772         }
3773 
3774         spin_unlock_irq(&callback_lock);
3775         return ret;
3776 }
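
/*
 * A hypothetical userspace counterpart to the "read in a single call"
 * advice above: fetch cpuset.cpus.effective with one read() into a buffer
 * large enough for the whole list, so a concurrently changing mask cannot
 * be seen half-updated across chunks.  The path assumes a cgroup v2 mount
 * at /sys/fs/cgroup and a group named "demo"; both are example values.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];                 /* roomy enough for one full cpulist */
        int fd = open("/sys/fs/cgroup/demo/cpuset.cpus.effective", O_RDONLY);
        ssize_t n;

        if (fd < 0) {
                perror("open");
                return 1;
        }
        n = read(fd, buf, sizeof(buf) - 1);     /* single read, no chunking */
        if (n > 0) {
                buf[n] = '\0';
                printf("effective cpus: %s", buf);
        }
        close(fd);
        return 0;
}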
3777 
3778 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
3779 {
3780         struct cpuset *cs = css_cs(css);
3781         cpuset_filetype_t type = cft->private;
3782         switch (type) {
3783         case FILE_CPU_EXCLUSIVE:
3784                 return is_cpu_exclusive(cs);
3785         case FILE_MEM_EXCLUSIVE:
3786                 return is_mem_exclusive(cs);
3787         case FILE_MEM_HARDWALL:
3788                 return is_mem_hardwall(cs);
3789         case FILE_SCHED_LOAD_BALANCE:
3790                 return is_sched_load_balance(cs);
3791         case FILE_MEMORY_MIGRATE:
3792                 return is_memory_migrate(cs);
3793         case FILE_MEMORY_PRESSURE_ENABLED:
3794                 return cpuset_memory_pressure_enabled;
3795         case FILE_MEMORY_PRESSURE:
3796                 return fmeter_getrate(&cs->fmeter);
3797         case FILE_SPREAD_PAGE:
3798                 return is_spread_page(cs);
3799         case FILE_SPREAD_SLAB:
3800                 return is_spread_slab(cs);
3801         default:
3802                 BUG();
3803         }
3804 
3805         /* Unreachable but makes gcc happy */
3806         return 0;
3807 }
3808 
3809 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
3810 {
3811         struct cpuset *cs = css_cs(css);
3812         cpuset_filetype_t type = cft->private;
3813         switch (type) {
3814         case FILE_SCHED_RELAX_DOMAIN_LEVEL:
3815                 return cs->relax_domain_level;
3816         default:
3817                 BUG();
3818         }
3819 
3820         /* Unreachable but makes gcc happy */
3821         return 0;
3822 }
3823 
3824 static int sched_partition_show(struct seq_file *seq, void *v)
3825 {
3826         struct cpuset *cs = css_cs(seq_css(seq));
3827         const char *err, *type = NULL;
3828 
3829         switch (cs->partition_root_state) {
3830         case PRS_ROOT:
3831                 seq_puts(seq, "root\n");
3832                 break;
3833         case PRS_ISOLATED:
3834                 seq_puts(seq, "isolated\n");
3835                 break;
3836         case PRS_MEMBER:
3837                 seq_puts(seq, "member\n");
3838                 break;
3839         case PRS_INVALID_ROOT:
3840                 type = "root";
3841                 fallthrough;
3842         case PRS_INVALID_ISOLATED:
3843                 if (!type)
3844                         type = "isolated";
3845                 err = perr_strings[READ_ONCE(cs->prs_err)];
3846                 if (err)
3847                         seq_printf(seq, "%s invalid (%s)\n", type, err);
3848                 else
3849                         seq_printf(seq, "%s invalid\n", type);
3850                 break;
3851         }
3852         return 0;
3853 }
3854 
3855 static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
3856                                      size_t nbytes, loff_t off)
3857 {
3858         struct cpuset *cs = css_cs(of_css(of));
3859         int val;
3860         int retval = -ENODEV;
3861 
3862         buf = strstrip(buf);
3863 
3864         if (!strcmp(buf, "root"))
3865                 val = PRS_ROOT;
3866         else if (!strcmp(buf, "member"))
3867                 val = PRS_MEMBER;
3868         else if (!strcmp(buf, "isolated"))
3869                 val = PRS_ISOLATED;
3870         else
3871                 return -EINVAL;
3872 
3873         css_get(&cs->css);
3874         cpus_read_lock();
3875         mutex_lock(&cpuset_mutex);
3876         if (!is_cpuset_online(cs))
3877                 goto out_unlock;
3878 
3879         retval = update_prstate(cs, val);
3880 out_unlock:
3881         mutex_unlock(&cpuset_mutex);
3882         cpus_read_unlock();
3883         css_put(&cs->css);
3884         return retval ?: nbytes;
3885 }
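
/*
 * A hypothetical userspace sketch of flipping a partition state through
 * this handler: the only accepted strings are "root", "member" and
 * "isolated", matching the parsing above.  The cgroup path is an assumed
 * example.  Note that a request which cannot be satisfied typically leaves
 * the partition in an invalid state (reported by a read of the same file
 * via sched_partition_show()) rather than failing the write itself.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/sys/fs/cgroup/demo/cpuset.cpus.partition";
        const char *state = "root";     /* or "member" / "isolated" */
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror(path);
                return 1;
        }
        if (write(fd, state, strlen(state)) < 0)
                perror("write");
        close(fd);
        return 0;
}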
3886 
3887 /*
3888  * for the common functions, 'private' gives the type of file
3889  */
3890 
3891 static struct cftype legacy_files[] = {
3892         {
3893                 .name = "cpus",
3894                 .seq_show = cpuset_common_seq_show,
3895                 .write = cpuset_write_resmask,
3896                 .max_write_len = (100U + 6 * NR_CPUS),
3897                 .private = FILE_CPULIST,
3898         },
3899 
3900         {
3901                 .name = "mems",
3902                 .seq_show = cpuset_common_seq_show,
3903                 .write = cpuset_write_resmask,
3904                 .max_write_len = (100U + 6 * MAX_NUMNODES),
3905                 .private = FILE_MEMLIST,
3906         },
3907 
3908         {
3909                 .name = "effective_cpus",
3910                 .seq_show = cpuset_common_seq_show,
3911                 .private = FILE_EFFECTIVE_CPULIST,
3912         },
3913 
3914         {
3915                 .name = "effective_mems",
3916                 .seq_show = cpuset_common_seq_show,
3917                 .private = FILE_EFFECTIVE_MEMLIST,
3918         },
3919 
3920         {
3921                 .name = "cpu_exclusive",
3922                 .read_u64 = cpuset_read_u64,
3923                 .write_u64 = cpuset_write_u64,
3924                 .private = FILE_CPU_EXCLUSIVE,
3925         },
3926 
3927         {
3928                 .name = "mem_exclusive",
3929                 .read_u64 = cpuset_read_u64,
3930                 .write_u64 = cpuset_write_u64,
3931                 .private = FILE_MEM_EXCLUSIVE,
3932         },
3933 
3934         {
3935                 .name = "mem_hardwall",
3936                 .read_u64 = cpuset_read_u64,
3937                 .write_u64 = cpuset_write_u64,
3938                 .private = FILE_MEM_HARDWALL,
3939         },
3940 
3941         {
3942                 .name = "sched_load_balance",
3943                 .read_u64 = cpuset_read_u64,
3944                 .write_u64 = cpuset_write_u64,
3945                 .private = FILE_SCHED_LOAD_BALANCE,
3946         },
3947 
3948         {
3949                 .name = "sched_relax_domain_level",
3950                 .read_s64 = cpuset_read_s64,
3951                 .write_s64 = cpuset_write_s64,
3952                 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
3953         },
3954 
3955         {
3956                 .name = "memory_migrate",
3957                 .read_u64 = cpuset_read_u64,
3958                 .write_u64 = cpuset_write_u64,
3959                 .private = FILE_MEMORY_MIGRATE,
3960         },
3961 
3962         {
3963                 .name = "memory_pressure",
3964                 .read_u64 = cpuset_read_u64,
3965                 .private = FILE_MEMORY_PRESSURE,
3966         },
3967 
3968         {
3969                 .name = "memory_spread_page",
3970                 .read_u64 = cpuset_read_u64,
3971                 .write_u64 = cpuset_write_u64,
3972                 .private = FILE_SPREAD_PAGE,
3973         },
3974 
3975         {
3976                 /* obsolete, may be removed in the future */
3977                 .name = "memory_spread_slab",
3978                 .read_u64 = cpuset_read_u64,
3979                 .write_u64 = cpuset_write_u64,
3980                 .private = FILE_SPREAD_SLAB,
3981         },
3982 
3983         {
3984                 .name = "memory_pressure_enabled",
3985                 .flags = CFTYPE_ONLY_ON_ROOT,
3986                 .read_u64 = cpuset_read_u64,
3987                 .write_u64 = cpuset_write_u64,
3988                 .private = FILE_MEMORY_PRESSURE_ENABLED,
3989         },
3990 
3991         { }     /* terminate */
3992 };
3993 
3994 /*
3995  * This is currently a minimal set for the default hierarchy. It can be
3996  * expanded later on by migrating more features and control files from v1.
3997  */
3998 static struct cftype dfl_files[] = {
3999         {
4000                 .name = "cpus",
4001                 .seq_show = cpuset_common_seq_show,
4002                 .write = cpuset_write_resmask,
4003                 .max_write_len = (100U + 6 * NR_CPUS),
4004                 .private = FILE_CPULIST,
4005                 .flags = CFTYPE_NOT_ON_ROOT,
4006         },
4007 
4008         {
4009                 .name = "mems",
4010                 .seq_show = cpuset_common_seq_show,
4011                 .write = cpuset_write_resmask,
4012                 .max_write_len = (100U + 6 * MAX_NUMNODES),
4013                 .private = FILE_MEMLIST,
4014                 .flags = CFTYPE_NOT_ON_ROOT,
4015         },
4016 
4017         {
4018                 .name = "cpus.effective",
4019                 .seq_show = cpuset_common_seq_show,
4020                 .private = FILE_EFFECTIVE_CPULIST,
4021         },
4022 
4023         {
4024                 .name = "mems.effective",
4025                 .seq_show = cpuset_common_seq_show,
4026                 .private = FILE_EFFECTIVE_MEMLIST,
4027         },
4028 
4029         {
4030                 .name = "cpus.partition",
4031                 .seq_show = sched_partition_show,
4032                 .write = sched_partition_write,
4033                 .private = FILE_PARTITION_ROOT,
4034                 .flags = CFTYPE_NOT_ON_ROOT,
4035                 .file_offset = offsetof(struct cpuset, partition_file),
4036         },
4037 
4038         {
4039                 .name = "cpus.exclusive",
4040                 .seq_show = cpuset_common_seq_show,
4041                 .write = cpuset_write_resmask,
4042                 .max_write_len = (100U + 6 * NR_CPUS),
4043                 .private = FILE_EXCLUSIVE_CPULIST,
4044                 .flags = CFTYPE_NOT_ON_ROOT,
4045         },
4046 
4047         {
4048                 .name = "cpus.exclusive.effective",
4049                 .seq_show = cpuset_common_seq_show,
4050                 .private = FILE_EFFECTIVE_XCPULIST,
4051                 .flags = CFTYPE_NOT_ON_ROOT,
4052         },
4053 
4054         {
4055                 .name = "cpus.subpartitions",
4056                 .seq_show = cpuset_common_seq_show,
4057                 .private = FILE_SUBPARTS_CPULIST,
4058                 .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
4059         },
4060 
4061         {
4062                 .name = "cpus.isolated",
4063                 .seq_show = cpuset_common_seq_show,
4064                 .private = FILE_ISOLATED_CPULIST,
4065                 .flags = CFTYPE_ONLY_ON_ROOT,
4066         },
4067 
4068         { }     /* terminate */
4069 };
4070 
4071 
4072 /**
4073  * cpuset_css_alloc - Allocate a cpuset css
4074  * @parent_css: Parent css of the control group that the new cpuset will be
4075  *              part of
4076  * Return: cpuset css on success, -ENOMEM on failure.
4077  *
4078  * Allocate and initialize a new cpuset css for a non-NULL @parent_css;
4079  * return the top cpuset css otherwise.
4080  */
4081 static struct cgroup_subsys_state *
4082 cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
4083 {
4084         struct cpuset *cs;
4085 
4086         if (!parent_css)
4087                 return &top_cpuset.css;
4088 
4089         cs = kzalloc(sizeof(*cs), GFP_KERNEL);
4090         if (!cs)
4091                 return ERR_PTR(-ENOMEM);
4092 
4093         if (alloc_cpumasks(cs, NULL)) {
4094                 kfree(cs);
4095                 return ERR_PTR(-ENOMEM);
4096         }
4097 
4098         __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
4099         fmeter_init(&cs->fmeter);
4100         cs->relax_domain_level = -1;
4101         INIT_LIST_HEAD(&cs->remote_sibling);
4102 
4103         /* Set CS_MEMORY_MIGRATE for default hierarchy */
4104         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
4105                 __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
4106 
4107         return &cs->css;
4108 }
4109 
4110 static int cpuset_css_online(struct cgroup_subsys_state *css)
4111 {
4112         struct cpuset *cs = css_cs(css);
4113         struct cpuset *parent = parent_cs(cs);
4114         struct cpuset *tmp_cs;
4115         struct cgroup_subsys_state *pos_css;
4116 
4117         if (!parent)
4118                 return 0;
4119 
4120         cpus_read_lock();
4121         mutex_lock(&cpuset_mutex);
4122 
4123         set_bit(CS_ONLINE, &cs->flags);
4124         if (is_spread_page(parent))
4125                 set_bit(CS_SPREAD_PAGE, &cs->flags);
4126         if (is_spread_slab(parent))
4127                 set_bit(CS_SPREAD_SLAB, &cs->flags);
4128         /*
4129          * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
4130          */
4131         if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
4132             !is_sched_load_balance(parent))
4133                 clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
4134 
4135         cpuset_inc();
4136 
4137         spin_lock_irq(&callback_lock);
4138         if (is_in_v2_mode()) {
4139                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
4140                 cs->effective_mems = parent->effective_mems;
4141                 cs->use_parent_ecpus = true;
4142                 parent->child_ecpus_count++;
4143         }
4144         spin_unlock_irq(&callback_lock);
4145 
4146         if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
4147                 goto out_unlock;
4148 
4149         /*
4150          * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
4151          * set.  This flag handling is implemented in cgroup core for
4152          * historical reasons - the flag may be specified during mount.
4153          *
4154          * Currently, if any sibling cpusets have exclusive cpus or mem, we
4155          * refuse to clone the configuration - thereby refusing to let the
4156          * task enter, and as a result refusing the sys_unshare() or
4157          * clone() which initiated it.  If this becomes a problem for some
4158          * users who wish to allow that scenario, then this could be
4159          * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
4160          * (and likewise for mems) to the new cgroup.
4161          */
4162         rcu_read_lock();
4163         cpuset_for_each_child(tmp_cs, pos_css, parent) {
4164                 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
4165                         rcu_read_unlock();
4166                         goto out_unlock;
4167                 }
4168         }
4169         rcu_read_unlock();
4170 
4171         spin_lock_irq(&callback_lock);
4172         cs->mems_allowed = parent->mems_allowed;
4173         cs->effective_mems = parent->mems_allowed;
4174         cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
4175         cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
4176         spin_unlock_irq(&callback_lock);
4177 out_unlock:
4178         mutex_unlock(&cpuset_mutex);
4179         cpus_read_unlock();
4180         return 0;
4181 }
4182 
4183 /*
4184  * If the cpuset being removed has its flag 'sched_load_balance'
4185  * enabled, then simulate turning sched_load_balance off, which
4186  * will call rebuild_sched_domains_locked(). That is not needed
4187  * in the default hierarchy where only changes in partition
4188  * will cause repartitioning.
4189  *
4190  * If the cpuset has the 'sched.partition' flag enabled, simulate
4191  * turning 'sched.partition' off.
4192  */
4193 
4194 static void cpuset_css_offline(struct cgroup_subsys_state *css)
4195 {
4196         struct cpuset *cs = css_cs(css);
4197 
4198         cpus_read_lock();
4199         mutex_lock(&cpuset_mutex);
4200 
4201         if (is_partition_valid(cs))
4202                 update_prstate(cs, 0);
4203 
4204         if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
4205             is_sched_load_balance(cs))
4206                 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
4207 
4208         if (cs->use_parent_ecpus) {
4209                 struct cpuset *parent = parent_cs(cs);
4210 
4211                 cs->use_parent_ecpus = false;
4212                 parent->child_ecpus_count--;
4213         }
4214 
4215         cpuset_dec();
4216         clear_bit(CS_ONLINE, &cs->flags);
4217 
4218         mutex_unlock(&cpuset_mutex);
4219         cpus_read_unlock();
4220 }
4221 
4222 static void cpuset_css_free(struct cgroup_subsys_state *css)
4223 {
4224         struct cpuset *cs = css_cs(css);
4225 
4226         free_cpuset(cs);
4227 }
4228 
4229 static void cpuset_bind(struct cgroup_subsys_state *root_css)
4230 {
4231         mutex_lock(&cpuset_mutex);
4232         spin_lock_irq(&callback_lock);
4233 
4234         if (is_in_v2_mode()) {
4235                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
4236                 cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
4237                 top_cpuset.mems_allowed = node_possible_map;
4238         } else {
4239                 cpumask_copy(top_cpuset.cpus_allowed,
4240                              top_cpuset.effective_cpus);
4241                 top_cpuset.mems_allowed = top_cpuset.effective_mems;
4242         }
4243 
4244         spin_unlock_irq(&callback_lock);
4245         mutex_unlock(&cpuset_mutex);
4246 }
4247 
4248 /*
4249  * In case the child is cloned into a cpuset different from its parent,
4250  * additional checks are done to see if the move is allowed.
4251  */
4252 static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
4253 {
4254         struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
4255         bool same_cs;
4256         int ret;
4257 
4258         rcu_read_lock();
4259         same_cs = (cs == task_cs(current));
4260         rcu_read_unlock();
4261 
4262         if (same_cs)
4263                 return 0;
4264 
4265         lockdep_assert_held(&cgroup_mutex);
4266         mutex_lock(&cpuset_mutex);
4267 
4268         /* Check to see if task is allowed in the cpuset */
4269         ret = cpuset_can_attach_check(cs);
4270         if (ret)
4271                 goto out_unlock;
4272 
4273         ret = task_can_attach(task);
4274         if (ret)
4275                 goto out_unlock;
4276 
4277         ret = security_task_setscheduler(task);
4278         if (ret)
4279                 goto out_unlock;
4280 
4281         /*
4282          * Mark attach is in progress.  This makes validate_change() fail
4283          * changes which zero cpus/mems_allowed.
4284          */
4285         cs->attach_in_progress++;
4286 out_unlock:
4287         mutex_unlock(&cpuset_mutex);
4288         return ret;
4289 }
4290 
4291 static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
4292 {
4293         struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
4294         bool same_cs;
4295 
4296         rcu_read_lock();
4297         same_cs = (cs == task_cs(current));
4298         rcu_read_unlock();
4299 
4300         if (same_cs)
4301                 return;
4302 
4303         mutex_lock(&cpuset_mutex);
4304         cs->attach_in_progress--;
4305         if (!cs->attach_in_progress)
4306                 wake_up(&cpuset_attach_wq);
4307         mutex_unlock(&cpuset_mutex);
4308 }
4309 
4310 /*
4311  * Make sure the new task conforms to the current state of its parent,
4312  * which could have been changed by cpuset just after it inherits the
4313  * state from the parent and before it sits on the cgroup's task list.
4314  */
4315 static void cpuset_fork(struct task_struct *task)
4316 {
4317         struct cpuset *cs;
4318         bool same_cs;
4319 
4320         rcu_read_lock();
4321         cs = task_cs(task);
4322         same_cs = (cs == task_cs(current));
4323         rcu_read_unlock();
4324 
4325         if (same_cs) {
4326                 if (cs == &top_cpuset)
4327                         return;
4328 
4329                 set_cpus_allowed_ptr(task, current->cpus_ptr);
4330                 task->mems_allowed = current->mems_allowed;
4331                 return;
4332         }
4333 
4334         /* CLONE_INTO_CGROUP */
4335         mutex_lock(&cpuset_mutex);
4336         guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
4337         cpuset_attach_task(cs, task);
4338 
4339         cs->attach_in_progress--;
4340         if (!cs->attach_in_progress)
4341                 wake_up(&cpuset_attach_wq);
4342 
4343         mutex_unlock(&cpuset_mutex);
4344 }
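
/*
 * Example (not part of cpuset.c): the CLONE_INTO_CGROUP path above runs
 * when userspace clones a child directly into another cgroup with
 * clone3().  A minimal sketch; the target group /sys/fs/cgroup/demo is
 * an assumption, and the raw syscall is used because glibc has no
 * clone3() wrapper.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int cgfd = open("/sys/fs/cgroup/demo", O_RDONLY | O_DIRECTORY);
	struct clone_args args = {
		.flags		= CLONE_INTO_CGROUP,
		.exit_signal	= SIGCHLD,
		.cgroup		= (__u64)cgfd,
	};
	long pid;

	if (cgfd < 0) {
		perror("open cgroup dir");
		return 1;
	}

	pid = syscall(SYS_clone3, &args, sizeof(args));
	if (pid == 0) {
		/*
		 * The child starts life in the target cpuset; cpuset_fork()
		 * gave it that cpuset's cpumask and mems_allowed.
		 */
		execlp("cat", "cat", "/proc/self/cpuset", (char *)NULL);
		_exit(127);
	}
	if (pid > 0)
		waitpid(pid, NULL, 0);
	else
		perror("clone3");
	close(cgfd);
	return 0;
}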
4345 
4346 struct cgroup_subsys cpuset_cgrp_subsys = {
4347         .css_alloc      = cpuset_css_alloc,
4348         .css_online     = cpuset_css_online,
4349         .css_offline    = cpuset_css_offline,
4350         .css_free       = cpuset_css_free,
4351         .can_attach     = cpuset_can_attach,
4352         .cancel_attach  = cpuset_cancel_attach,
4353         .attach         = cpuset_attach,
4354         .post_attach    = cpuset_post_attach,
4355         .bind           = cpuset_bind,
4356         .can_fork       = cpuset_can_fork,
4357         .cancel_fork    = cpuset_cancel_fork,
4358         .fork           = cpuset_fork,
4359         .legacy_cftypes = legacy_files,
4360         .dfl_cftypes    = dfl_files,
4361         .early_init     = true,
4362         .threaded       = true,
4363 };
4364 
4365 /**
4366  * cpuset_init - initialize cpusets at system boot
4367  *
4368  * Description: Initialize top_cpuset
4369  **/
4370 
4371 int __init cpuset_init(void)
4372 {
4373         BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
4374         BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
4375         BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
4376         BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
4377         BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
4378         BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
4379 
4380         cpumask_setall(top_cpuset.cpus_allowed);
4381         nodes_setall(top_cpuset.mems_allowed);
4382         cpumask_setall(top_cpuset.effective_cpus);
4383         cpumask_setall(top_cpuset.effective_xcpus);
4384         cpumask_setall(top_cpuset.exclusive_cpus);
4385         nodes_setall(top_cpuset.effective_mems);
4386 
4387         fmeter_init(&top_cpuset.fmeter);
4388         INIT_LIST_HEAD(&remote_children);
4389 
4390         BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
4391 
4392         return 0;
4393 }
4394 
4395 /*
4396  * If the CPU and/or memory hotplug handlers below unplug any CPUs
4397  * or memory nodes, we need to walk over the cpuset hierarchy,
4398  * removing that CPU or node from all cpusets.  If this removes the
4399  * last CPU or node from a cpuset, then move the tasks in the empty
4400  * cpuset to its next-highest non-empty parent.
4401  */
4402 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
4403 {
4404         struct cpuset *parent;
4405 
4406         /*
4407          * Find its next-highest non-empty parent (the top cpuset
4408          * has online cpus, so it can't be empty).
4409          */
4410         parent = parent_cs(cs);
4411         while (cpumask_empty(parent->cpus_allowed) ||
4412                         nodes_empty(parent->mems_allowed))
4413                 parent = parent_cs(parent);
4414 
4415         if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
4416                 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
4417                 pr_cont_cgroup_name(cs->css.cgroup);
4418                 pr_cont("\n");
4419         }
4420 }
4421 
4422 static void cpuset_migrate_tasks_workfn(struct work_struct *work)
4423 {
4424         struct cpuset_remove_tasks_struct *s;
4425 
4426         s = container_of(work, struct cpuset_remove_tasks_struct, work);
4427         remove_tasks_in_empty_cpuset(s->cs);
4428         css_put(&s->cs->css);
4429         kfree(s);
4430 }
4431 
4432 static void
4433 hotplug_update_tasks_legacy(struct cpuset *cs,
4434                             struct cpumask *new_cpus, nodemask_t *new_mems,
4435                             bool cpus_updated, bool mems_updated)
4436 {
4437         bool is_empty;
4438 
4439         spin_lock_irq(&callback_lock);
4440         cpumask_copy(cs->cpus_allowed, new_cpus);
4441         cpumask_copy(cs->effective_cpus, new_cpus);
4442         cs->mems_allowed = *new_mems;
4443         cs->effective_mems = *new_mems;
4444         spin_unlock_irq(&callback_lock);
4445 
4446         /*
4447          * Don't call update_tasks_cpumask() if the cpuset becomes empty,
4448          * as the tasks will be migrated to an ancestor.
4449          */
4450         if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
4451                 update_tasks_cpumask(cs, new_cpus);
4452         if (mems_updated && !nodes_empty(cs->mems_allowed))
4453                 update_tasks_nodemask(cs);
4454 
4455         is_empty = cpumask_empty(cs->cpus_allowed) ||
4456                    nodes_empty(cs->mems_allowed);
4457 
4458         /*
4459          * Move tasks to the nearest ancestor with execution resources.
4460          * This is a full cgroup operation which will also call back into
4461          * cpuset. Execute it asynchronously using a workqueue.
4462          */
4463         if (is_empty && cs->css.cgroup->nr_populated_csets &&
4464             css_tryget_online(&cs->css)) {
4465                 struct cpuset_remove_tasks_struct *s;
4466 
4467                 s = kzalloc(sizeof(*s), GFP_KERNEL);
4468                 if (WARN_ON_ONCE(!s)) {
4469                         css_put(&cs->css);
4470                         return;
4471                 }
4472 
4473                 s->cs = cs;
4474                 INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
4475                 schedule_work(&s->work);
4476         }
4477 }
4478 
4479 static void
4480 hotplug_update_tasks(struct cpuset *cs,
4481                      struct cpumask *new_cpus, nodemask_t *new_mems,
4482                      bool cpus_updated, bool mems_updated)
4483 {
4484         /* A partition root is allowed to have empty effective cpus */
4485         if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
4486                 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
4487         if (nodes_empty(*new_mems))
4488                 *new_mems = parent_cs(cs)->effective_mems;
4489 
4490         spin_lock_irq(&callback_lock);
4491         cpumask_copy(cs->effective_cpus, new_cpus);
4492         cs->effective_mems = *new_mems;
4493         spin_unlock_irq(&callback_lock);
4494 
4495         if (cpus_updated)
4496                 update_tasks_cpumask(cs, new_cpus);
4497         if (mems_updated)
4498                 update_tasks_nodemask(cs);
4499 }
4500 
4501 static bool force_rebuild;
4502 
4503 void cpuset_force_rebuild(void)
4504 {
4505         force_rebuild = true;
4506 }
4507 
4508 /**
4509  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
4510  * @cs: cpuset in interest
4511  * @tmp: the tmpmasks structure pointer
4512  *
4513  * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
4514  * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
4515  * all its tasks are moved to the nearest ancestor with both resources.
4516  */
4517 static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
4518 {
4519         static cpumask_t new_cpus;
4520         static nodemask_t new_mems;
4521         bool cpus_updated;
4522         bool mems_updated;
4523         bool remote;
4524         int partcmd = -1;
4525         struct cpuset *parent;
4526 retry:
4527         wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
4528 
4529         mutex_lock(&cpuset_mutex);
4530 
4531         /*
4532          * We have raced with task attaching. We wait until attaching
4533          * is finished, so we won't attach a task to an empty cpuset.
4534          */
4535         if (cs->attach_in_progress) {
4536                 mutex_unlock(&cpuset_mutex);
4537                 goto retry;
4538         }
4539 
4540         parent = parent_cs(cs);
4541         compute_effective_cpumask(&new_cpus, cs, parent);
4542         nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
4543 
4544         if (!tmp || !cs->partition_root_state)
4545                 goto update_tasks;
4546 
4547         /*
4548          * Compute effective_cpus for a valid partition root; this may
4549          * invalidate child partition roots if necessary.
4550          */
4551         remote = is_remote_partition(cs);
4552         if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
4553                 compute_partition_effective_cpumask(cs, &new_cpus);
4554 
4555         if (remote && cpumask_empty(&new_cpus) &&
4556             partition_is_populated(cs, NULL)) {
4557                 remote_partition_disable(cs, tmp);
4558                 compute_effective_cpumask(&new_cpus, cs, parent);
4559                 remote = false;
4560                 cpuset_force_rebuild();
4561         }
4562 
4563         /*
4564          * Force the partition to become invalid if either one of
4565          * the following conditions holds:
4566          * 1) empty effective cpus but not valid empty partition.
4567          * 2) parent is invalid or doesn't grant any cpus to child
4568          *    partitions.
4569          */
4570         if (is_local_partition(cs) && (!is_partition_valid(parent) ||
4571                                 tasks_nocpu_error(parent, cs, &new_cpus)))
4572                 partcmd = partcmd_invalidate;
4573         /*
4574          * On the other hand, an invalid partition root may be transitioned
4575          * back to a regular one.
4576          */
4577         else if (is_partition_valid(parent) && is_partition_invalid(cs))
4578                 partcmd = partcmd_update;
4579 
4580         if (partcmd >= 0) {
4581                 update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
4582                 if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
4583                         compute_partition_effective_cpumask(cs, &new_cpus);
4584                         cpuset_force_rebuild();
4585                 }
4586         }
4587 
4588 update_tasks:
4589         cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
4590         mems_updated = !nodes_equal(new_mems, cs->effective_mems);
4591         if (!cpus_updated && !mems_updated)
4592                 goto unlock;    /* Hotplug doesn't affect this cpuset */
4593 
4594         if (mems_updated)
4595                 check_insane_mems_config(&new_mems);
4596 
4597         if (is_in_v2_mode())
4598                 hotplug_update_tasks(cs, &new_cpus, &new_mems,
4599                                      cpus_updated, mems_updated);
4600         else
4601                 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
4602                                             cpus_updated, mems_updated);
4603 
4604 unlock:
4605         mutex_unlock(&cpuset_mutex);
4606 }
4607 
4608 /**
4609  * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
4610  *
4611  * This function is called after either CPU or memory configuration has
4612  * changed and updates cpuset accordingly.  The top_cpuset is always
4613  * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
4614  * order to make cpusets transparent (of no effect) on systems that are
4615  * actively using CPU hotplug but making no active use of cpusets.
4616  *
4617  * Non-root cpusets are only affected by offlining.  If any CPUs or memory
4618  * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
4619  * all descendants.
4620  *
4621  * Note that CPU offlining during suspend is ignored.  We don't modify
4622  * cpusets across suspend/resume cycles at all.
4623  *
4624  * CPU / memory hotplug is handled synchronously.
4625  */
4626 static void cpuset_handle_hotplug(void)
4627 {
4628         static cpumask_t new_cpus;
4629         static nodemask_t new_mems;
4630         bool cpus_updated, mems_updated;
4631         bool on_dfl = is_in_v2_mode();
4632         struct tmpmasks tmp, *ptmp = NULL;
4633 
4634         if (on_dfl && !alloc_cpumasks(NULL, &tmp))
4635                 ptmp = &tmp;
4636 
4637         lockdep_assert_cpus_held();
4638         mutex_lock(&cpuset_mutex);
4639 
4640         /* fetch the available cpus/mems and find out which changed how */
4641         cpumask_copy(&new_cpus, cpu_active_mask);
4642         new_mems = node_states[N_MEMORY];
4643 
4644         /*
4645          * If subpartitions_cpus is populated, it is likely that the check
4646          * below will produce a false positive on cpus_updated when the cpu
4647          * list isn't changed. It is extra work, but it is better to be safe.
4648          */
4649         cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
4650                        !cpumask_empty(subpartitions_cpus);
4651         mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
4652 
4653         /*
4654          * In the rare case that hotplug removes all the cpus in
4655          * subpartitions_cpus, we assume that cpus are updated.
4656          */
4657         if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
4658                 cpus_updated = true;
4659 
4660         /* For v1, synchronize cpus_allowed to cpu_active_mask */
4661         if (cpus_updated) {
4662                 spin_lock_irq(&callback_lock);
4663                 if (!on_dfl)
4664                         cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
4665                 /*
4666                  * Make sure that CPUs allocated to child partitions
4667                  * do not show up in effective_cpus. If no CPU is left,
4668                  * we clear the subpartitions_cpus & let the child partitions
4669                  * fight for the CPUs again.
4670                  */
4671                 if (!cpumask_empty(subpartitions_cpus)) {
4672                         if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
4673                                 top_cpuset.nr_subparts = 0;
4674                                 cpumask_clear(subpartitions_cpus);
4675                         } else {
4676                                 cpumask_andnot(&new_cpus, &new_cpus,
4677                                                subpartitions_cpus);
4678                         }
4679                 }
4680                 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
4681                 spin_unlock_irq(&callback_lock);
4682                 /* we don't mess with cpumasks of tasks in top_cpuset */
4683         }
4684 
4685         /* synchronize mems_allowed to N_MEMORY */
4686         if (mems_updated) {
4687                 spin_lock_irq(&callback_lock);
4688                 if (!on_dfl)
4689                         top_cpuset.mems_allowed = new_mems;
4690                 top_cpuset.effective_mems = new_mems;
4691                 spin_unlock_irq(&callback_lock);
4692                 update_tasks_nodemask(&top_cpuset);
4693         }
4694 
4695         mutex_unlock(&cpuset_mutex);
4696 
4697         /* if cpus or mems changed, we need to propagate to descendants */
4698         if (cpus_updated || mems_updated) {
4699                 struct cpuset *cs;
4700                 struct cgroup_subsys_state *pos_css;
4701 
4702                 rcu_read_lock();
4703                 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
4704                         if (cs == &top_cpuset || !css_tryget_online(&cs->css))
4705                                 continue;
4706                         rcu_read_unlock();
4707 
4708                         cpuset_hotplug_update_tasks(cs, ptmp);
4709 
4710                         rcu_read_lock();
4711                         css_put(&cs->css);
4712                 }
4713                 rcu_read_unlock();
4714         }
4715 
4716         /* rebuild sched domains if cpus_allowed has changed */
4717         if (cpus_updated || force_rebuild) {
4718                 force_rebuild = false;
4719                 rebuild_sched_domains_cpuslocked();
4720         }
4721 
4722         free_cpumasks(NULL, ptmp);
4723 }
4724 
4725 void cpuset_update_active_cpus(void)
4726 {
4727         /*
4728          * We're inside the cpu hotplug critical region which usually nests
4729          * inside cgroup synchronization.  Hotplug processing is handled
4730          * synchronously here; see cpuset_handle_hotplug().
4731          */
4732         cpuset_handle_hotplug();
4733 }
4734 
4735 /*
4736  * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
4737  * Call this routine anytime after node_states[N_MEMORY] changes.
4738  * See cpuset_update_active_cpus() for CPU hotplug handling.
4739  */
4740 static int cpuset_track_online_nodes(struct notifier_block *self,
4741                                 unsigned long action, void *arg)
4742 {
4743         cpuset_handle_hotplug();
4744         return NOTIFY_OK;
4745 }
4746 
4747 /**
4748  * cpuset_init_smp - initialize cpus_allowed
4749  *
4750  * Description: Finish top cpuset after the cpu and node maps are initialized
4751  */
4752 void __init cpuset_init_smp(void)
4753 {
4754         /*
4755          * cpus_allowed/mems_allowed set to v2 values in the initial
4756          * cpuset_bind() call will be reset to v1 values in another
4757          * cpuset_bind() call when v1 cpuset is mounted.
4758          */
4759         top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
4760 
4761         cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
4762         top_cpuset.effective_mems = node_states[N_MEMORY];
4763 
4764         hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
4765 
4766         cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
4767         BUG_ON(!cpuset_migrate_mm_wq);
4768 }
4769 
4770 /**
4771  * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
4772  * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
4773  * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
4774  *
4775  * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
4776  * attached to the specified @tsk.  Guaranteed to return some non-empty
4777  * subset of cpu_online_mask, even if this means going outside the
4778  * task's cpuset, except when the task is in the top cpuset.
4779  **/
4780 
4781 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
4782 {
4783         unsigned long flags;
4784         struct cpuset *cs;
4785 
4786         spin_lock_irqsave(&callback_lock, flags);
4787         rcu_read_lock();
4788 
4789         cs = task_cs(tsk);
4790         if (cs != &top_cpuset)
4791                 guarantee_online_cpus(tsk, pmask);
4792         /*
4793          * Tasks in the top cpuset won't get updates to their cpumasks
4794          * when a hotplug online/offline event happens. So we include all
4795          * offline cpus in the allowed cpu list.
4796          */
4797         if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
4798                 const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
4799 
4800                 /*
4801                  * We first exclude cpus allocated to partitions. If there is no
4802                  * allowable online cpu left, we fall back to all possible cpus.
4803                  */
4804                 cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
4805                 if (!cpumask_intersects(pmask, cpu_online_mask))
4806                         cpumask_copy(pmask, possible_mask);
4807         }
4808 
4809         rcu_read_unlock();
4810         spin_unlock_irqrestore(&callback_lock, flags);
4811 }
4812 
4813 /**
4814  * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
4815  * @tsk: pointer to task_struct with which the scheduler is struggling
4816  *
4817  * Description: In the case that the scheduler cannot find an allowed cpu in
4818  * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
4819  * mode however, this value is the same as task_cs(tsk)->effective_cpus,
4820  * which will not contain a sane cpumask during cases such as cpu hotplugging.
4821  * This is the absolute last resort for the scheduler and it is only used if
4822  * _every_ other avenue has been traveled.
4823  *
4824  * Returns true if the affinity of @tsk was changed, false otherwise.
4825  **/
4826 
4827 bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
4828 {
4829         const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
4830         const struct cpumask *cs_mask;
4831         bool changed = false;
4832 
4833         rcu_read_lock();
4834         cs_mask = task_cs(tsk)->cpus_allowed;
4835         if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
4836                 do_set_cpus_allowed(tsk, cs_mask);
4837                 changed = true;
4838         }
4839         rcu_read_unlock();
4840 
4841         /*
4842          * We own tsk->cpus_allowed, nobody can change it under us.
4843          *
4844          * But we use cs and cs->cpus_allowed locklessly and thus can
4845          * race with cgroup_attach_task() or update_cpumask() and get
4846          * the wrong tsk->cpus_allowed. However, both cases imply the
4847          * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
4848          * which takes task_rq_lock().
4849          *
4850          * If we are called after it dropped the lock we must see all
4851          * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
4852          * set any mask even if it is not right from task_cs()'s pov;
4853          * the pending set_cpus_allowed_ptr() will fix things.
4854          *
4855          * select_fallback_rq() will fix things up and set cpu_possible_mask
4856          * if required.
4857          */
4858         return changed;
4859 }
4860 
4861 void __init cpuset_init_current_mems_allowed(void)
4862 {
4863         nodes_setall(current->mems_allowed);
4864 }
4865 
4866 /**
4867  * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
4868  * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
4869  *
4870  * Description: Returns the nodemask_t mems_allowed of the cpuset
4871  * attached to the specified @tsk.  Guaranteed to return some non-empty
4872  * subset of node_states[N_MEMORY], even if this means going outside the
4873  * task's cpuset.
4874  **/
4875 
4876 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
4877 {
4878         nodemask_t mask;
4879         unsigned long flags;
4880 
4881         spin_lock_irqsave(&callback_lock, flags);
4882         rcu_read_lock();
4883         guarantee_online_mems(task_cs(tsk), &mask);
4884         rcu_read_unlock();
4885         spin_unlock_irqrestore(&callback_lock, flags);
4886 
4887         return mask;
4888 }
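
/*
 * Example (not part of cpuset.c): a closely related view, the calling
 * task's own mems_allowed, is visible from userspace through
 * get_mempolicy(MPOL_F_MEMS_ALLOWED).  A minimal sketch using the raw
 * syscall so that libnuma is not required.
 */
#include <linux/mempolicy.h>	/* MPOL_F_MEMS_ALLOWED */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	unsigned long mask[16] = { 0 };		/* room for 1024 node bits */
	unsigned long maxnode = sizeof(mask) * 8;
	unsigned long bits_per_long = sizeof(long) * 8;
	unsigned long i;

	if (syscall(SYS_get_mempolicy, NULL, mask, maxnode, NULL,
		    MPOL_F_MEMS_ALLOWED)) {
		perror("get_mempolicy");
		return 1;
	}

	for (i = 0; i < maxnode; i++)
		if (mask[i / bits_per_long] & (1UL << (i % bits_per_long)))
			printf("node %lu allowed\n", i);
	return 0;
}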
4889 
4890 /**
4891  * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
4892  * @nodemask: the nodemask to be checked
4893  *
4894  * Are any of the nodes in the nodemask allowed in current->mems_allowed?
4895  */
4896 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
4897 {
4898         return nodes_intersects(*nodemask, current->mems_allowed);
4899 }
4900 
4901 /*
4902  * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
4903  * mem_hardwall ancestor to the specified cpuset.  Call holding
4904  * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
4905  * (an unusual configuration), then returns the root cpuset.
4906  */
4907 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
4908 {
4909         while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
4910                 cs = parent_cs(cs);
4911         return cs;
4912 }
4913 
4914 /*
4915  * cpuset_node_allowed - Can we allocate on a memory node?
4916  * @node: is this an allowed node?
4917  * @gfp_mask: memory allocation flags
4918  *
4919  * If we're in interrupt, yes, we can always allocate.  If @node is set in
4920  * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
4921  * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
4922  * yes.  If current has access to memory reserves as an oom victim, yes.
4923  * Otherwise, no.
4924  *
4925  * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
4926  * and do not allow allocations outside the current task's cpuset
4927  * unless the task has been OOM killed.
4928  * GFP_KERNEL allocations are not so marked, so can escape to the
4929  * nearest enclosing hardwalled ancestor cpuset.
4930  *
4931  * Scanning up parent cpusets requires callback_lock.  The
4932  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
4933  * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
4934  * current task's mems_allowed came up empty on the first pass over
4935  * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
4936  * cpuset are short of memory, might require taking the callback_lock.
4937  *
4938  * The first call here from mm/page_alloc:get_page_from_freelist()
4939  * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
4940  * so no allocation on a node outside the cpuset is allowed (unless
4941  * in interrupt, of course).
4942  *
4943  * The second pass through get_page_from_freelist() doesn't even call
4944  * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
4945  * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
4946  * in alloc_flags.  That logic and the checks below have the combined
4947  * effect that:
4948  *      in_interrupt - any node ok (current task context irrelevant)
4949  *      GFP_ATOMIC   - any node ok
4950  *      tsk_is_oom_victim   - any node ok
4951  *      GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
4952  *      GFP_USER     - only nodes in current task's mems_allowed ok.
4953  */
4954 bool cpuset_node_allowed(int node, gfp_t gfp_mask)
4955 {
4956         struct cpuset *cs;              /* current cpuset ancestors */
4957         bool allowed;                   /* is allocation in zone z allowed? */
4958         unsigned long flags;
4959 
4960         if (in_interrupt())
4961                 return true;
4962         if (node_isset(node, current->mems_allowed))
4963                 return true;
4964         /*
4965          * Allow tasks that have access to memory reserves because they have
4966          * been OOM killed to get memory anywhere.
4967          */
4968         if (unlikely(tsk_is_oom_victim(current)))
4969                 return true;
4970         if (gfp_mask & __GFP_HARDWALL)  /* If hardwall request, stop here */
4971                 return false;
4972 
4973         if (current->flags & PF_EXITING) /* Let dying task have memory */
4974                 return true;
4975 
4976         /* Not hardwall and node outside mems_allowed: scan up cpusets */
4977         spin_lock_irqsave(&callback_lock, flags);
4978 
4979         rcu_read_lock();
4980         cs = nearest_hardwall_ancestor(task_cs(current));
4981         allowed = node_isset(node, cs->mems_allowed);
4982         rcu_read_unlock();
4983 
4984         spin_unlock_irqrestore(&callback_lock, flags);
4985         return allowed;
4986 }
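
/*
 * Example (not part of cpuset.c): a minimal in-kernel sketch of the
 * GFP distinction described above.  may_use_node() is a hypothetical
 * helper, not a kernel API; it only illustrates that GFP_USER carries
 * __GFP_HARDWALL and is therefore confined to current's cpuset, while
 * GFP_KERNEL is not and may escape to the nearest hardwalled ancestor.
 */
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/types.h>

static bool may_use_node(int nid, bool user_backed)
{
	/* GFP_USER is GFP_KERNEL plus __GFP_HARDWALL. */
	gfp_t gfp = user_backed ? GFP_USER : GFP_KERNEL;

	return cpuset_node_allowed(nid, gfp);
}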
4987 
4988 /**
4989  * cpuset_spread_node() - On which node to begin search for a page
4990  * @rotor: round robin rotor
4991  *
4992  * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
4993  * tasks in a cpuset with is_spread_page or is_spread_slab set),
4994  * and if the memory allocation used cpuset_mem_spread_node()
4995  * to determine on which node to start looking, as it will for
4996  * certain page cache or slab cache pages such as those used for file
4997  * system buffers and inode caches, then instead of starting on the
4998  * local node to look for a free page, spread the starting
4999  * node around the task's mems_allowed nodes.
5000  *
5001  * We don't have to worry about the returned node being offline
5002  * because "it can't happen", and even if it did, it would be ok.
5003  *
5004  * The routines calling guarantee_online_mems() are careful to
5005  * only set nodes in task->mems_allowed that are online.  So it
5006  * should not be possible for the following code to return an
5007  * offline node.  But if it did, that would be ok, as this routine
5008  * is not returning the node where the allocation must be, only
5009  * the node where the search should start.  The zonelist passed to
5010  * __alloc_pages() will include all nodes.  If the slab allocator
5011  * is passed an offline node, it will fall back to the local node.
5012  * See kmem_cache_alloc_node().
5013  */
5014 static int cpuset_spread_node(int *rotor)
5015 {
5016         return *rotor = next_node_in(*rotor, current->mems_allowed);
5017 }
5018 
5019 /**
5020  * cpuset_mem_spread_node() - On which node to begin search for a file page
5021  */
5022 int cpuset_mem_spread_node(void)
5023 {
5024         if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
5025                 current->cpuset_mem_spread_rotor =
5026                         node_random(&current->mems_allowed);
5027 
5028         return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
5029 }
5030 
5031 /**
5032  * cpuset_slab_spread_node() - On which node to begin search for a slab page
5033  */
5034 int cpuset_slab_spread_node(void)
5035 {
5036         if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
5037                 current->cpuset_slab_spread_rotor =
5038                         node_random(&current->mems_allowed);
5039 
5040         return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
5041 }
5042 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
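
/*
 * Example (not part of cpuset.c): page-cache spreading is opted into
 * per cpuset from userspace.  A minimal sketch assuming a v1 cpuset
 * hierarchy mounted at /sys/fs/cgroup/cpuset and a group named "batch"
 * (both hypothetical); writing "1" ends up setting PF_SPREAD_PAGE on
 * the group's tasks, so cpuset_mem_spread_node() round-robins their
 * page cache over mems_allowed.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/cpuset/batch/cpuset.memory_spread_page";
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, "1", 1) != 1) {
		perror(path);
		if (fd >= 0)
			close(fd);
		return 1;
	}
	close(fd);
	return 0;
}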
5043 
5044 /**
5045  * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
5046  * @tsk1: pointer to task_struct of some task.
5047  * @tsk2: pointer to task_struct of some other task.
5048  *
5049  * Description: Return true if @tsk1's mems_allowed intersects the
5050  * mems_allowed of @tsk2.  Used by the OOM killer to determine if
5051  * one task's memory usage might impact the memory available
5052  * to the other.
5053  **/
5054 
5055 int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
5056                                    const struct task_struct *tsk2)
5057 {
5058         return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
5059 }
5060 
5061 /**
5062  * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
5063  *
5064  * Description: Prints current's name, cpuset name, and cached copy of its
5065  * mems_allowed to the kernel log.
5066  */
5067 void cpuset_print_current_mems_allowed(void)
5068 {
5069         struct cgroup *cgrp;
5070 
5071         rcu_read_lock();
5072 
5073         cgrp = task_cs(current)->css.cgroup;
5074         pr_cont(",cpuset=");
5075         pr_cont_cgroup_name(cgrp);
5076         pr_cont(",mems_allowed=%*pbl",
5077                 nodemask_pr_args(&current->mems_allowed));
5078 
5079         rcu_read_unlock();
5080 }
5081 
5082 /*
5083  * Collection of memory_pressure is suppressed unless
5084  * this flag is enabled by writing "1" to the special
5085  * cpuset file 'memory_pressure_enabled' in the root cpuset.
5086  */
5087 
5088 int cpuset_memory_pressure_enabled __read_mostly;
5089 
5090 /*
5091  * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
5092  *
5093  * Keep a running average of the rate of synchronous (direct)
5094  * page reclaim efforts initiated by tasks in each cpuset.
5095  *
5096  * This represents the rate at which some task in the cpuset
5097  * ran low on memory on all nodes it was allowed to use, and
5098  * had to enter the kernel's page reclaim code in an effort to
5099  * create more free memory by tossing clean pages or swapping
5100  * or writing dirty pages.
5101  *
5102  * Display to user space in the per-cpuset read-only file
5103  * "memory_pressure".  Value displayed is an integer
5104  * representing the recent rate of entry into the synchronous
5105  * (direct) page reclaim by any task attached to the cpuset.
5106  */
5107 
5108 void __cpuset_memory_pressure_bump(void)
5109 {
5110         rcu_read_lock();
5111         fmeter_markevent(&task_cs(current)->fmeter);
5112         rcu_read_unlock();
5113 }
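
/*
 * Example (not part of cpuset.c): reading the per-cpuset
 * "memory_pressure" average from userspace after enabling collection in
 * the root cpuset.  A minimal sketch; the v1 mount point
 * /sys/fs/cgroup/cpuset and the group "jobs" are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int write_one(const char *path)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, "1", 1);
	close(fd);
	return n == 1 ? 0 : -1;
}

int main(void)
{
	char buf[32];
	int fd, n;

	/* Collection stays suppressed until the root file is set to 1. */
	if (write_one("/sys/fs/cgroup/cpuset/cpuset.memory_pressure_enabled"))
		perror("memory_pressure_enabled");

	/* Recent rate of direct reclaim entered by tasks in "jobs". */
	fd = open("/sys/fs/cgroup/cpuset/jobs/cpuset.memory_pressure", O_RDONLY);
	if (fd < 0) {
		perror("memory_pressure");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}
	close(fd);
	return 0;
}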
5114 
5115 #ifdef CONFIG_PROC_PID_CPUSET
5116 /*
5117  * proc_cpuset_show()
5118  *  - Print task's cpuset path into seq_file.
5119  *  - Used for /proc/<pid>/cpuset.
5120  *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
5121  *    doesn't really matter if tsk->cpuset changes after we read it,
5122  *    and we take css_set_lock, keeping cpuset_attach() from changing it
5123  *    anyway.
5124  */
5125 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
5126                      struct pid *pid, struct task_struct *tsk)
5127 {
5128         char *buf;
5129         struct cgroup_subsys_state *css;
5130         int retval;
5131 
5132         retval = -ENOMEM;
5133         buf = kmalloc(PATH_MAX, GFP_KERNEL);
5134         if (!buf)
5135                 goto out;
5136 
5137         rcu_read_lock();
5138         spin_lock_irq(&css_set_lock);
5139         css = task_css(tsk, cpuset_cgrp_id);
5140         retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
5141                                        current->nsproxy->cgroup_ns);
5142         spin_unlock_irq(&css_set_lock);
5143         rcu_read_unlock();
5144 
5145         if (retval == -E2BIG)
5146                 retval = -ENAMETOOLONG;
5147         if (retval < 0)
5148                 goto out_free;
5149         seq_puts(m, buf);
5150         seq_putc(m, '\n');
5151         retval = 0;
5152 out_free:
5153         kfree(buf);
5154 out:
5155         return retval;
5156 }
5157 #endif /* CONFIG_PROC_PID_CPUSET */
5158 
5159 /* Display task mems_allowed in /proc/<pid>/status file. */
5160 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
5161 {
5162         seq_printf(m, "Mems_allowed:\t%*pb\n",
5163                    nodemask_pr_args(&task->mems_allowed));
5164         seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
5165                    nodemask_pr_args(&task->mems_allowed));
5166 }
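
/*
 * Example (not part of cpuset.c): both userspace views above, the cpuset
 * path from /proc/<pid>/cpuset and the Mems_allowed lines from
 * /proc/<pid>/status, read here for the calling task itself.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f;

	f = fopen("/proc/self/cpuset", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			printf("cpuset path: %s", line);
		fclose(f);
	}

	f = fopen("/proc/self/status", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			if (!strncmp(line, "Mems_allowed", 12))
				fputs(line, stdout);
		fclose(f);
	}
	return 0;
}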
5167 
