
TOMOYO Linux Cross Reference
Linux/mm/memcontrol-v1.c


Diff markup: /mm/memcontrol-v1.c is identical between the sparc and sparc64 architectures, so the common source is listed once below.


// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/memcontrol.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/pagewalk.h>
#include <linux/backing-dev.h>
#include <linux/swap_cgroup.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/file.h>
#include <linux/seq_buf.h>

#include "internal.h"
#include "swap.h"
#include "memcontrol-v1.h"

/*
 * Cgroups above their limits are maintained in an RB-tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_node {
        struct rb_root rb_root;
        struct rb_node *rb_rightmost;
        spinlock_t lock;
};

struct mem_cgroup_tree {
        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

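/*
 * Note: soft_limit_tree holds one RB-tree per NUMA node, indexed by
 * node id.  memcg1_update_tree() and memcg1_soft_limit_reclaim() below
 * look the tree up via soft_limit_tree.rb_tree_per_node[nid].
 */
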
/*
 * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2

/* Stuff for move charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON       0x1ULL
#define MOVE_FILE       0x2ULL
#define MOVE_MASK       (MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
        spinlock_t        lock; /* for from, to */
        struct mm_struct  *mm;
        struct mem_cgroup *from;
        struct mem_cgroup *to;
        unsigned long flags;
        unsigned long precharge;
        unsigned long moved_charge;
        unsigned long moved_swap;
        struct task_struct *moving_task;        /* a task moving charges */
        wait_queue_head_t waitq;                /* a waitq for other context */
} mc = {
        .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/* for OOM */
struct mem_cgroup_eventfd_list {
        struct list_head list;
        struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
        /*
         * memcg which the event belongs to.
         */
        struct mem_cgroup *memcg;
        /*
         * eventfd to signal userspace about the event.
         */
        struct eventfd_ctx *eventfd;
        /*
         * Each of these is stored in a list by the cgroup.
         */
        struct list_head list;
        /*
         * register_event() callback will be used to add a new userspace
         * waiter for changes related to this event.  Use eventfd_signal()
         * on eventfd to send notification to userspace.
         */
        int (*register_event)(struct mem_cgroup *memcg,
                              struct eventfd_ctx *eventfd, const char *args);
        /*
         * unregister_event() callback will be called when userspace closes
         * the eventfd or on cgroup removal.  This callback must be set
         * if you want to provide notification functionality.
         */
        void (*unregister_event)(struct mem_cgroup *memcg,
                                 struct eventfd_ctx *eventfd);
        /*
         * All fields below are needed to unregister the event when
         * userspace closes the eventfd.
         */
        poll_table pt;
        wait_queue_head_t *wqh;
        wait_queue_entry_t wait;
        struct work_struct remove;
};

#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)       ((val) & 0xffff)

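/*
 * Worked example (illustrative, not part of the original source):
 * MEMFILE_PRIVATE(1, RES_LIMIT) packs a resource type of 1 into bits
 * 16..31 and RES_LIMIT (== 1, see the enum below) into bits 0..15,
 * yielding 0x10001.  MEMFILE_TYPE(0x10001) then recovers 1 and
 * MEMFILE_ATTR(0x10001) recovers RES_LIMIT.
 */
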
enum {
        RES_USAGE,
        RES_LIMIT,
        RES_MAX_USAGE,
        RES_FAILCNT,
        RES_SOFT_LIMIT,
};

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
        .name = "memcg_oom_lock",
};
#endif

DEFINE_SPINLOCK(memcg_oom_lock);

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
                                         struct mem_cgroup_tree_per_node *mctz,
                                         unsigned long new_usage_in_excess)
{
        struct rb_node **p = &mctz->rb_root.rb_node;
        struct rb_node *parent = NULL;
        struct mem_cgroup_per_node *mz_node;
        bool rightmost = true;

        if (mz->on_tree)
                return;

        mz->usage_in_excess = new_usage_in_excess;
        if (!mz->usage_in_excess)
                return;
        while (*p) {
                parent = *p;
                mz_node = rb_entry(parent, struct mem_cgroup_per_node,
                                        tree_node);
                if (mz->usage_in_excess < mz_node->usage_in_excess) {
                        p = &(*p)->rb_left;
                        rightmost = false;
                } else {
                        p = &(*p)->rb_right;
                }
        }

        if (rightmost)
                mctz->rb_rightmost = &mz->tree_node;

        rb_link_node(&mz->tree_node, parent, p);
        rb_insert_color(&mz->tree_node, &mctz->rb_root);
        mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
                                         struct mem_cgroup_tree_per_node *mctz)
{
        if (!mz->on_tree)
                return;

        if (&mz->tree_node == mctz->rb_rightmost)
                mctz->rb_rightmost = rb_prev(&mz->tree_node);

        rb_erase(&mz->tree_node, &mctz->rb_root);
        mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
                                       struct mem_cgroup_tree_per_node *mctz)
{
        unsigned long flags;

        spin_lock_irqsave(&mctz->lock, flags);
        __mem_cgroup_remove_exceeded(mz, mctz);
        spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
        unsigned long nr_pages = page_counter_read(&memcg->memory);
        unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
        unsigned long excess = 0;

        if (nr_pages > soft_limit)
                excess = nr_pages - soft_limit;

        return excess;
}
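
/*
 * Worked example (illustrative): with page_counter_read() returning
 * 1536 pages and a soft limit of 1024 pages, soft_limit_excess()
 * returns 512.  If usage is at or below the limit it returns 0, which
 * the tree code below treats as "no tree ops".
 */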

static void memcg1_update_tree(struct mem_cgroup *memcg, int nid)
{
        unsigned long excess;
        struct mem_cgroup_per_node *mz;
        struct mem_cgroup_tree_per_node *mctz;

        if (lru_gen_enabled()) {
                if (soft_limit_excess(memcg))
                        lru_gen_soft_reclaim(memcg, nid);
                return;
        }

        mctz = soft_limit_tree.rb_tree_per_node[nid];
        if (!mctz)
                return;
        /*
         * Necessary to update all ancestors when hierarchy is used,
         * because their event counter is not touched.
         */
        for (; memcg; memcg = parent_mem_cgroup(memcg)) {
                mz = memcg->nodeinfo[nid];
                excess = soft_limit_excess(memcg);
                /*
                 * We have to update the tree if mz is on the RB-tree or
                 * the memcg is over its soft limit.
                 */
                if (excess || mz->on_tree) {
                        unsigned long flags;

                        spin_lock_irqsave(&mctz->lock, flags);
                        /* if on-tree, remove it */
                        if (mz->on_tree)
                                __mem_cgroup_remove_exceeded(mz, mctz);
                        /*
                         * Insert again. mz->usage_in_excess will be updated.
                         * If excess is 0, no tree ops.
                         */
                        __mem_cgroup_insert_exceeded(mz, mctz, excess);
                        spin_unlock_irqrestore(&mctz->lock, flags);
                }
        }
}

void memcg1_remove_from_trees(struct mem_cgroup *memcg)
{
        struct mem_cgroup_tree_per_node *mctz;
        struct mem_cgroup_per_node *mz;
        int nid;

        for_each_node(nid) {
                mz = memcg->nodeinfo[nid];
                mctz = soft_limit_tree.rb_tree_per_node[nid];
                if (mctz)
                        mem_cgroup_remove_exceeded(mz, mctz);
        }
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
        struct mem_cgroup_per_node *mz;

retry:
        mz = NULL;
        if (!mctz->rb_rightmost)
                goto done;              /* Nothing to reclaim from */

        mz = rb_entry(mctz->rb_rightmost,
                      struct mem_cgroup_per_node, tree_node);
        /*
         * Remove the node now, but someone else can add it back.
         * We will add it back at the end of reclaim to its correct
         * position in the tree.
         */
        __mem_cgroup_remove_exceeded(mz, mctz);
        if (!soft_limit_excess(mz->memcg) ||
            !css_tryget(&mz->memcg->css))
                goto retry;
done:
        return mz;
}
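
/*
 * Note: because __mem_cgroup_insert_exceeded() keys the tree on
 * usage_in_excess and sends equal or larger keys to the right,
 * mctz->rb_rightmost always caches the node with the largest excess.
 * The lookup above therefore finds the worst offender in O(1) instead
 * of walking the tree.
 */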

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
        struct mem_cgroup_per_node *mz;

        spin_lock_irq(&mctz->lock);
        mz = __mem_cgroup_largest_soft_limit_node(mctz);
        spin_unlock_irq(&mctz->lock);
        return mz;
}

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
                                   pg_data_t *pgdat,
                                   gfp_t gfp_mask,
                                   unsigned long *total_scanned)
{
        struct mem_cgroup *victim = NULL;
        int total = 0;
        int loop = 0;
        unsigned long excess;
        unsigned long nr_scanned;
        struct mem_cgroup_reclaim_cookie reclaim = {
                .pgdat = pgdat,
        };

        excess = soft_limit_excess(root_memcg);

        while (1) {
                victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
                if (!victim) {
                        loop++;
                        if (loop >= 2) {
                                /*
                                 * If we have not been able to reclaim
                                 * anything, it might be because there are
                                 * no reclaimable pages under this hierarchy.
                                 */
                                if (!total)
                                        break;
                                /*
                                 * We want to do more targeted reclaim.
                                 * excess >> 2 is not so excessive as to
                                 * reclaim too much, nor so little that we
                                 * keep coming back to reclaim from this
                                 * cgroup.
                                 */
                                if (total >= (excess >> 2) ||
                                        (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
                                        break;
                        }
                        continue;
                }
                total += mem_cgroup_shrink_node(victim, gfp_mask, false,
                                        pgdat, &nr_scanned);
                *total_scanned += nr_scanned;
                if (!soft_limit_excess(root_memcg))
                        break;
        }
        mem_cgroup_iter_break(root_memcg, victim);
        return total;
}
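
/*
 * Note: the loop above only re-evaluates its exit conditions when a
 * full walk of the hierarchy completes (victim == NULL).  After two
 * such walks it gives up if nothing was reclaimed; otherwise it stops
 * once roughly a quarter of the root's excess (excess >> 2) has been
 * reclaimed or MEM_CGROUP_MAX_RECLAIM_LOOPS walks have run.
 */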

unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                            gfp_t gfp_mask,
                                            unsigned long *total_scanned)
{
        unsigned long nr_reclaimed = 0;
        struct mem_cgroup_per_node *mz, *next_mz = NULL;
        unsigned long reclaimed;
        int loop = 0;
        struct mem_cgroup_tree_per_node *mctz;
        unsigned long excess;

        if (lru_gen_enabled())
                return 0;

        if (order > 0)
                return 0;

        mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];

        /*
         * Do not even bother to check the largest node if the root
         * is empty. Do it locklessly to prevent lock bouncing. Races
         * are acceptable as soft limit is best effort anyway.
         */
        if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
                return 0;

        /*
         * This loop can run for a while, especially if mem_cgroups
         * continuously keep exceeding their soft limit and putting the
         * system under pressure.
         */
        do {
                if (next_mz)
                        mz = next_mz;
                else
                        mz = mem_cgroup_largest_soft_limit_node(mctz);
                if (!mz)
                        break;

                reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
                                                    gfp_mask, total_scanned);
                nr_reclaimed += reclaimed;
                spin_lock_irq(&mctz->lock);

                /*
                 * If we failed to reclaim anything from this memory cgroup
                 * it is time to move on to the next cgroup.
                 */
                next_mz = NULL;
                if (!reclaimed)
                        next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

                excess = soft_limit_excess(mz->memcg);
                /*
                 * One school of thought says that we should not add
                 * back the node to the tree if reclaim returns 0.
                 * But our reclaim could return 0 simply because, due
                 * to priority, we are exposing a smaller subset of
                 * memory to reclaim from. Consider this as a longer
                 * term TODO.
                 */
                /* If excess == 0, no tree ops */
                __mem_cgroup_insert_exceeded(mz, mctz, excess);
                spin_unlock_irq(&mctz->lock);
                css_put(&mz->memcg->css);
                loop++;
                /*
                 * Could not reclaim anything and there are no more
                 * mem cgroups to try, or we seem to be looping without
                 * reclaiming anything.
                 */
                if (!nr_reclaimed &&
                        (next_mz == NULL ||
                        loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
                        break;
        } while (!nr_reclaimed);
        if (next_mz)
                css_put(&next_mz->memcg->css);
        return nr_reclaimed;
}
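
/*
 * Note: __mem_cgroup_largest_soft_limit_node() returns with a css
 * reference taken via css_tryget(), which is why each victim is
 * released with css_put() above, and why a lookahead next_mz that was
 * never processed gets its own css_put() before returning.
 */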

/*
 * A routine for checking whether "mem" is under move_account().
 *
 * Checks whether a cgroup is mc.from, mc.to, or under the hierarchy of
 * the moving cgroups. This is for waiting at high memory pressure
 * caused by "move".
 */
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
        struct mem_cgroup *from;
        struct mem_cgroup *to;
        bool ret = false;
        /*
         * Unlike task_move routines, we access mc.to, mc.from not under
         * mutual exclusion by cgroup_mutex. Here, we take the spinlock instead.
         */
        spin_lock(&mc.lock);
        from = mc.from;
        to = mc.to;
        if (!from)
                goto unlock;

        ret = mem_cgroup_is_descendant(from, memcg) ||
                mem_cgroup_is_descendant(to, memcg);
unlock:
        spin_unlock(&mc.lock);
        return ret;
}

bool memcg1_wait_acct_move(struct mem_cgroup *memcg)
{
        if (mc.moving_task && current != mc.moving_task) {
                if (mem_cgroup_under_move(memcg)) {
                        DEFINE_WAIT(wait);
                        prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
                        /* moving charge context might have finished. */
                        if (mc.moving_task)
                                schedule();
                        finish_wait(&mc.waitq, &wait);
                        return true;
                }
        }
        return false;
}

/**
 * folio_memcg_lock - Bind a folio to its memcg.
 * @folio: The folio.
 *
 * This function prevents unlocked LRU folios from being moved to
 * another cgroup.
 *
 * It ensures the lifetime of the bound memcg.  The caller is responsible
 * for the lifetime of the folio.
 */
void folio_memcg_lock(struct folio *folio)
{
        struct mem_cgroup *memcg;
        unsigned long flags;

        /*
         * The RCU lock is held throughout the transaction.  The fast
         * path can get away without acquiring the memcg->move_lock
         * because page moving starts with an RCU grace period.
         */
        rcu_read_lock();

        if (mem_cgroup_disabled())
                return;
again:
        memcg = folio_memcg(folio);
        if (unlikely(!memcg))
                return;

#ifdef CONFIG_PROVE_LOCKING
        local_irq_save(flags);
        might_lock(&memcg->move_lock);
        local_irq_restore(flags);
#endif

        if (atomic_read(&memcg->moving_account) <= 0)
                return;

        spin_lock_irqsave(&memcg->move_lock, flags);
        if (memcg != folio_memcg(folio)) {
                spin_unlock_irqrestore(&memcg->move_lock, flags);
                goto again;
        }

        /*
         * When charge migration first begins, we can have multiple
         * critical sections holding the fast-path RCU lock and one
         * holding the slowpath move_lock. Track the task who has the
         * move_lock for folio_memcg_unlock().
         */
        memcg->move_lock_task = current;
        memcg->move_lock_flags = flags;
}
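
/*
 * Usage sketch (illustrative, assuming a caller that updates
 * memcg-tracked folio state which must not race with charge moving):
 *
 *      folio_memcg_lock(folio);
 *      ... update memcg-tracked folio state ...
 *      folio_memcg_unlock(folio);
 *
 * On the fast path (no charge move in flight) only rcu_read_lock() is
 * held; on the slow path memcg->move_lock is additionally held with
 * IRQs disabled until folio_memcg_unlock() runs.
 */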

static void __folio_memcg_unlock(struct mem_cgroup *memcg)
{
        if (memcg && memcg->move_lock_task == current) {
                unsigned long flags = memcg->move_lock_flags;

                memcg->move_lock_task = NULL;
                memcg->move_lock_flags = 0;

                spin_unlock_irqrestore(&memcg->move_lock, flags);
        }

        rcu_read_unlock();
}

/**
 * folio_memcg_unlock - Release the binding between a folio and its memcg.
 * @folio: The folio.
 *
 * This releases the binding created by folio_memcg_lock().  This does
 * not change the accounting of this folio to its memcg, but it does
 * permit others to change it.
 */
void folio_memcg_unlock(struct folio *folio)
{
        __folio_memcg_unlock(folio_memcg(folio));
}

#ifdef CONFIG_SWAP
/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from:  mem_cgroup which the entry is moved from
 * @to:  mem_cgroup which the entry is moved to
 *
 * It succeeds only when the swap_cgroup's record for this entry is the same
 * as the mem_cgroup's id of @from.
 *
 * Returns 0 on success, -EINVAL on failure.
 *
 * The caller must have charged to @to, IOW, called page_counter_charge() for
 * both res and memsw, and called css_get().
 */
static int mem_cgroup_move_swap_account(swp_entry_t entry,
                                struct mem_cgroup *from, struct mem_cgroup *to)
{
        unsigned short old_id, new_id;

        old_id = mem_cgroup_id(from);
        new_id = mem_cgroup_id(to);

        if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
                mod_memcg_state(from, MEMCG_SWAP, -1);
                mod_memcg_state(to, MEMCG_SWAP, 1);
                return 0;
        }
        return -EINVAL;
}
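
/*
 * Note: swap_cgroup_cmpxchg() makes the handover atomic.  The
 * per-entry record flips from @from's id to @to's id only if it still
 * names @from; otherwise the function returns -EINVAL and neither
 * memcg's MEMCG_SWAP counter is touched.
 */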
585 #else                                             585 #else
586 static inline int mem_cgroup_move_swap_account    586 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
587                                 struct mem_cgr    587                                 struct mem_cgroup *from, struct mem_cgroup *to)
588 {                                                 588 {
589         return -EINVAL;                           589         return -EINVAL;
590 }                                                 590 }
591 #endif                                            591 #endif

static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
                                struct cftype *cft)
{
        return mem_cgroup_from_css(css)->move_charge_at_immigrate;
}

#ifdef CONFIG_MMU
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
                                 struct cftype *cft, u64 val)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. "
                     "Please report your usecase to linux-mm@kvack.org if you "
                     "depend on this functionality.\n");

        if (val & ~MOVE_MASK)
                return -EINVAL;

        /*
         * No kind of locking is needed in here, because ->can_attach() will
         * check this value once at the beginning of the process, and then
         * carry on with stale data. This means that changes to this value
         * will only affect task migrations starting after the change.
         */
        memcg->move_charge_at_immigrate = val;
        return 0;
}
#else
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
                                 struct cftype *cft, u64 val)
{
        return -ENOSYS;
}
#endif
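
/*
 * Illustrative usage (not part of this file): on a cgroup v1 hierarchy,
 * charge moving is enabled per cgroup through the file served by the
 * handlers above, e.g. assuming the memory controller is mounted at
 * /sys/fs/cgroup/memory:
 *
 *      echo 3 > /sys/fs/cgroup/memory/<group>/memory.move_charge_at_immigrate
 *
 * Bit 0 (MOVE_ANON) selects anonymous pages, bit 1 (MOVE_FILE) selects
 * mapped file pages; any value with bits outside MOVE_MASK is rejected
 * with -EINVAL.
 */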

#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
static int mem_cgroup_do_precharge(unsigned long count)
{
        int ret;

        /* Try a single bulk charge without reclaim first, kswapd may wake */
        ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
        if (!ret) {
                mc.precharge += count;
                return ret;
        }

        /* Try charges one by one with reclaim, but do not retry */
        while (count--) {
                ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
                if (ret)
                        return ret;
                mc.precharge++;
                cond_resched();
        }
        return 0;
}
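
/*
 * Editor's note: mc.precharge counts charges taken against mc.to in
 * advance of the actual moves; e.g. moving one PMD-mapped THP later
 * consumes HPAGE_PMD_NR precharges at once. The bulk attempt above is
 * made without direct reclaim (kswapd is still woken), and only on
 * failure are pages charged one by one with reclaim but no retries.
 */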

union mc_target {
        struct folio    *folio;
        swp_entry_t     ent;
};

enum mc_target_type {
        MC_TARGET_NONE = 0,
        MC_TARGET_PAGE,
        MC_TARGET_SWAP,
        MC_TARGET_DEVICE,
};

static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
                                                unsigned long addr, pte_t ptent)
{
        struct page *page = vm_normal_page(vma, addr, ptent);

        if (!page)
                return NULL;
        if (PageAnon(page)) {
                if (!(mc.flags & MOVE_ANON))
                        return NULL;
        } else {
                if (!(mc.flags & MOVE_FILE))
                        return NULL;
        }
        get_page(page);

        return page;
}

#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
                        pte_t ptent, swp_entry_t *entry)
{
        struct page *page = NULL;
        swp_entry_t ent = pte_to_swp_entry(ptent);

        if (!(mc.flags & MOVE_ANON))
                return NULL;

        /*
         * Handle device private pages that are not accessible by the CPU, but
         * stored as special swap entries in the page table.
         */
        if (is_device_private_entry(ent)) {
                page = pfn_swap_entry_to_page(ent);
                if (!get_page_unless_zero(page))
                        return NULL;
                return page;
        }

        if (non_swap_entry(ent))
                return NULL;

        /*
         * Because swap_cache_get_folio() updates some statistics counters,
         * we call find_get_page() with swapper_space directly.
         */
        page = find_get_page(swap_address_space(ent), swap_cache_index(ent));
        entry->val = ent.val;

        return page;
}
#else
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
                        pte_t ptent, swp_entry_t *entry)
{
        return NULL;
}
#endif
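
/*
 * Editor's note: a swap pte can thus yield three outcomes above: a
 * device-private page (a pfn swap entry pointing at CPU-inaccessible
 * device memory), NULL for other non-swap entries such as migration
 * entries, or the swap cache page for a genuine swap entry, with the
 * entry itself reported back through @entry for MC_TARGET_SWAP handling.
 */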

static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
                        unsigned long addr, pte_t ptent)
{
        unsigned long index;
        struct folio *folio;

        if (!vma->vm_file) /* anonymous vma */
                return NULL;
        if (!(mc.flags & MOVE_FILE))
                return NULL;

        /* The folio is moved even if it's not part of this task's RSS
         * (i.e. not yet faulted in). */
        /* shmem/tmpfs may report page out on swap: account for that too. */
        index = linear_page_index(vma, addr);
        folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
        if (IS_ERR(folio))
                return NULL;
        return folio_file_page(folio, index);
}
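
/*
 * Editor's note: filemap_get_incore_folio() also returns shmem folios
 * that have been swapped out (their page-cache slot holds a swap entry),
 * which is why a file pte that is currently none can still produce a
 * page to move here.
 */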

static void memcg1_check_events(struct mem_cgroup *memcg, int nid);
static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages);

/**
 * mem_cgroup_move_account - move account of the folio
 * @folio: The folio.
 * @compound: charge the page as compound or small page
 * @from: mem_cgroup which the folio is moved from.
 * @to: mem_cgroup which the folio is moved to. @from != @to.
 *
 * The folio must be locked and not on the LRU.
 *
 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
 * from old cgroup.
 */
static int mem_cgroup_move_account(struct folio *folio,
                                   bool compound,
                                   struct mem_cgroup *from,
                                   struct mem_cgroup *to)
{
        struct lruvec *from_vec, *to_vec;
        struct pglist_data *pgdat;
        unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
        int nid, ret;

        VM_BUG_ON(from == to);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
        VM_BUG_ON(compound && !folio_test_large(folio));

        ret = -EINVAL;
        if (folio_memcg(folio) != from)
                goto out;

        pgdat = folio_pgdat(folio);
        from_vec = mem_cgroup_lruvec(from, pgdat);
        to_vec = mem_cgroup_lruvec(to, pgdat);

        folio_memcg_lock(folio);

        if (folio_test_anon(folio)) {
                if (folio_mapped(folio)) {
                        __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
                        __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
                        if (folio_test_pmd_mappable(folio)) {
                                __mod_lruvec_state(from_vec, NR_ANON_THPS,
                                                   -nr_pages);
                                __mod_lruvec_state(to_vec, NR_ANON_THPS,
                                                   nr_pages);
                        }
                }
        } else {
                __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
                __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);

                if (folio_test_swapbacked(folio)) {
                        __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
                        __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
                }

                if (folio_mapped(folio)) {
                        __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
                        __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
                }

                if (folio_test_dirty(folio)) {
                        struct address_space *mapping = folio_mapping(folio);

                        if (mapping_can_writeback(mapping)) {
                                __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
                                                   -nr_pages);
                                __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
                                                   nr_pages);
                        }
                }
        }

#ifdef CONFIG_SWAP
        if (folio_test_swapcache(folio)) {
                __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
                __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
        }
#endif
        if (folio_test_writeback(folio)) {
                __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
                __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
        }

        /*
         * All state has been migrated, let's switch to the new memcg.
         *
         * It is safe to change page's memcg here because the page
         * is referenced, charged, isolated, and locked: we can't race
         * with (un)charging, migration, LRU putback, or anything else
         * that would rely on a stable page's memory cgroup.
         *
         * Note that folio_memcg_lock is a memcg lock, not a page lock,
         * to save space. As soon as we switch page's memory cgroup to a
         * new memcg that isn't locked, the above state can change
         * concurrently again. Make sure we're truly done with it.
         */
        smp_mb();

        css_get(&to->css);
        css_put(&from->css);

        /* Warning should never happen, so don't worry about refcount non-0 */
        WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
        folio->memcg_data = (unsigned long)to;

        __folio_memcg_unlock(from);

        ret = 0;
        nid = folio_nid(folio);

        local_irq_disable();
        memcg1_charge_statistics(to, nr_pages);
        memcg1_check_events(to, nid);
        memcg1_charge_statistics(from, -nr_pages);
        memcg1_check_events(from, nid);
        local_irq_enable();
out:
        return ret;
}
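
/*
 * Editor's note on the locking ladder above: the caller guarantees the
 * folio is locked, off the LRU, and referenced; folio_memcg_lock() then
 * stabilizes folio->memcg_data while the per-lruvec counters are moved
 * in pairs (-nr_pages from the old lruvec, +nr_pages to the new one),
 * so the global sums remain unchanged throughout.
 */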

/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer where the target page or swap entry will be
 *   stored (can be NULL)
 *
 * Context: Called with pte lock held.
 * Return:
 * * MC_TARGET_NONE - If the pte is not a target for move charge.
 * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for
 *   move charge. If @target is not NULL, the folio is stored in target->folio
 *   with extra refcnt taken (Caller should release it).
 * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a
 *   target for charge migration.  If @target is not NULL, the entry is
 *   stored in target->ent.
 * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and
 *   thus not on the lru.  For now such page is charged like a regular page
 *   would be as it is just special memory taking the place of a regular page.
 *   See Documentation/mm/hmm.rst and include/linux/hmm.h
 */
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
                unsigned long addr, pte_t ptent, union mc_target *target)
{
        struct page *page = NULL;
        struct folio *folio;
        enum mc_target_type ret = MC_TARGET_NONE;
        swp_entry_t ent = { .val = 0 };

        if (pte_present(ptent))
                page = mc_handle_present_pte(vma, addr, ptent);
        else if (pte_none_mostly(ptent))
                /*
                 * PTE markers should be treated as a none pte here, separated
                 * from other swap handling below.
                 */
                page = mc_handle_file_pte(vma, addr, ptent);
        else if (is_swap_pte(ptent))
                page = mc_handle_swap_pte(vma, ptent, &ent);

        if (page)
                folio = page_folio(page);
        if (target && page) {
                if (!folio_trylock(folio)) {
                        folio_put(folio);
                        return ret;
                }
                /*
                 * page_mapped() must be stable during the move. This
                 * pte is locked, so if it's present, the page cannot
                 * become unmapped. If it isn't, we have only partial
                 * control over the mapped state: the page lock will
                 * prevent new faults against pagecache and swapcache,
                 * so an unmapped page cannot become mapped. However,
                 * if the page is already mapped elsewhere, it can
                 * unmap, and there is nothing we can do about it.
                 * Alas, skip moving the page in this case.
                 */
                if (!pte_present(ptent) && page_mapped(page)) {
                        folio_unlock(folio);
                        folio_put(folio);
                        return ret;
                }
        }

        if (!page && !ent.val)
                return ret;
        if (page) {
                /*
                 * Do only a loose check w/o serialization.
                 * mem_cgroup_move_account() checks whether the page is
                 * valid or not under LRU exclusion.
                 */
                if (folio_memcg(folio) == mc.from) {
                        ret = MC_TARGET_PAGE;
                        if (folio_is_device_private(folio) ||
                            folio_is_device_coherent(folio))
                                ret = MC_TARGET_DEVICE;
                        if (target)
                                target->folio = folio;
                }
                if (!ret || !target) {
                        if (target)
                                folio_unlock(folio);
                        folio_put(folio);
                }
        }
        /*
         * There is a swap entry and a page doesn't exist or isn't charged.
         * But we cannot move a tail-page in a THP.
         */
        if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
            mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
                ret = MC_TARGET_SWAP;
                if (target)
                        target->ent = ent;
        }
        return ret;
}
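
/*
 * Editor's note: the trylock/refcount dance above means a folio that is
 * contended or remapped under us is simply skipped (MC_TARGET_NONE)
 * rather than waited for; charge moving is best-effort by design.
 */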

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider PMD mapped swapping or file mapped pages because THP does
 * not support them for now.
 * Caller should make sure that pmd_trans_huge(pmd) is true.
 */
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
                unsigned long addr, pmd_t pmd, union mc_target *target)
{
        struct page *page = NULL;
        struct folio *folio;
        enum mc_target_type ret = MC_TARGET_NONE;

        if (unlikely(is_swap_pmd(pmd))) {
                VM_BUG_ON(thp_migration_supported() &&
                                  !is_pmd_migration_entry(pmd));
                return ret;
        }
        page = pmd_page(pmd);
        VM_BUG_ON_PAGE(!page || !PageHead(page), page);
        folio = page_folio(page);
        if (!(mc.flags & MOVE_ANON))
                return ret;
        if (folio_memcg(folio) == mc.from) {
                ret = MC_TARGET_PAGE;
                if (target) {
                        folio_get(folio);
                        if (!folio_trylock(folio)) {
                                folio_put(folio);
                                return MC_TARGET_NONE;
                        }
                        target->folio = folio;
                }
        }
        return ret;
}
#else
static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
                unsigned long addr, pmd_t pmd, union mc_target *target)
{
        return MC_TARGET_NONE;
}
#endif
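
/*
 * Editor's note: in the THP case only fully PMD-mapped anonymous pages
 * are considered; pmd migration entries bail out early above, and
 * without CONFIG_TRANSPARENT_HUGEPAGE the stub makes every pmd a
 * non-target.
 */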

static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
                                        unsigned long addr, unsigned long end,
                                        struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        pte_t *pte;
        spinlock_t *ptl;

        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
                /*
                 * Note there cannot be MC_TARGET_DEVICE for now, as we do
                 * not support transparent huge pages with
                 * MEMORY_DEVICE_PRIVATE, but this might change.
                 */
                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                        mc.precharge += HPAGE_PMD_NR;
                spin_unlock(ptl);
                return 0;
        }

        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (!pte)
                return 0;
        for (; addr != end; pte++, addr += PAGE_SIZE)
                if (get_mctgt_type(vma, addr, ptep_get(pte), NULL))
                        mc.precharge++; /* increment precharge temporarily */
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();

        return 0;
}

static const struct mm_walk_ops precharge_walk_ops = {
        .pmd_entry      = mem_cgroup_count_precharge_pte_range,
        .walk_lock      = PGWALK_RDLOCK,
};
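
/*
 * Editor's note: struct mm_walk_ops is the generic pagewalk callback
 * table from <linux/pagewalk.h>; with only .pmd_entry set, the callback
 * above is invoked for every pmd in the walked range and handles both
 * the trans-huge and the pte-mapped cases itself. PGWALK_RDLOCK states
 * that the walk runs under the read-side mmap_lock, as taken in
 * mem_cgroup_count_precharge() below.
 */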

static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
        unsigned long precharge;

        mmap_read_lock(mm);
        walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
        mmap_read_unlock(mm);

        precharge = mc.precharge;
        mc.precharge = 0;

        return precharge;
}

static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
        unsigned long precharge = mem_cgroup_count_precharge(mm);

        VM_BUG_ON(mc.moving_task);
        mc.moving_task = current;
        return mem_cgroup_do_precharge(precharge);
}
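
/*
 * Editor's note: precharging is a two-pass scheme. The counting walk
 * above sizes the job (mc.precharge is reset to 0 after being read),
 * mem_cgroup_do_precharge() then charges that many pages to mc.to up
 * front, and the later move walk consumes one precharge per page it
 * actually moves, with any leftovers returned in __mem_cgroup_clear_mc().
 */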

/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
static void __mem_cgroup_clear_mc(void)
{
        struct mem_cgroup *from = mc.from;
        struct mem_cgroup *to = mc.to;

        /* we must uncharge all the leftover precharges from mc.to */
        if (mc.precharge) {
                mem_cgroup_cancel_charge(mc.to, mc.precharge);
                mc.precharge = 0;
        }
        /*
         * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
         * we must uncharge here.
         */
        if (mc.moved_charge) {
                mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
                mc.moved_charge = 0;
        }
        /* we must fix up refcnts and charges */
        if (mc.moved_swap) {
                /* uncharge swap account from the old cgroup */
                if (!mem_cgroup_is_root(mc.from))
                        page_counter_uncharge(&mc.from->memsw, mc.moved_swap);

                mem_cgroup_id_put_many(mc.from, mc.moved_swap);

                /*
                 * we charged both to->memory and to->memsw, so we
                 * should uncharge to->memory.
                 */
                if (!mem_cgroup_is_root(mc.to))
                        page_counter_uncharge(&mc.to->memory, mc.moved_swap);

                mc.moved_swap = 0;
        }
        memcg1_oom_recover(from);
        memcg1_oom_recover(to);
        wake_up_all(&mc.waitq);
}

static void mem_cgroup_clear_mc(void)
{
        struct mm_struct *mm = mc.mm;

        /*
         * we must clear moving_task before waking up waiters at the end of
         * task migration.
         */
        mc.moving_task = NULL;
        __mem_cgroup_clear_mc();
        spin_lock(&mc.lock);
        mc.from = NULL;
        mc.to = NULL;
        mc.mm = NULL;
        spin_unlock(&mc.lock);

        mmput(mm);
}
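
/*
 * Editor's note (sketch of the waiter protocol, as understood from the
 * comments above): tasks that found a move in progress sleep on
 * mc.waitq; mc.moving_task is cleared before __mem_cgroup_clear_mc()
 * issues its wake_up_all() so that woken tasks re-evaluate against NULL
 * and do not go back to sleep on a migration that has already finished.
 */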

int memcg1_can_attach(struct cgroup_taskset *tset)
{
        struct cgroup_subsys_state *css;
        struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
        struct mem_cgroup *from;
        struct task_struct *leader, *p;
        struct mm_struct *mm;
        unsigned long move_flags;
        int ret = 0;

        /* charge immigration isn't supported on the default hierarchy */
        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return 0;

        /*
         * Multi-process migrations only happen on the default hierarchy
         * where charge immigration is not used.  Perform charge
         * immigration if @tset contains a leader and whine if there are
         * multiple.
         */
        p = NULL;
        cgroup_taskset_for_each_leader(leader, css, tset) {
                WARN_ON_ONCE(p);
                p = leader;
                memcg = mem_cgroup_from_css(css);
        }
        if (!p)
                return 0;

        /*
         * We are now committed to this value, whatever it is. Changes in
         * this tunable will only affect upcoming migrations, not the
         * current one. So we need to save it, and keep it going.
         */
        move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
        if (!move_flags)
                return 0;

        from = mem_cgroup_from_task(p);

        VM_BUG_ON(from == memcg);

        mm = get_task_mm(p);
        if (!mm)
                return 0;
        /* We move charges only when we move an owner of the mm */
        if (mm->owner == p) {
                VM_BUG_ON(mc.from);
                VM_BUG_ON(mc.to);
                VM_BUG_ON(mc.precharge);
                VM_BUG_ON(mc.moved_charge);
                VM_BUG_ON(mc.moved_swap);

                spin_lock(&mc.lock);
                mc.mm = mm;
                mc.from = from;
                mc.to = memcg;
                mc.flags = move_flags;
                spin_unlock(&mc.lock);
                /* We set mc.moving_task later */

                ret = mem_cgroup_precharge_mc(mm);
                if (ret)
                        mem_cgroup_clear_mc();
        } else {
                mmput(mm);
        }
        return ret;
}
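
/*
 * Editor's note: only the mm owner triggers a move (mm->owner == p), so
 * a multi-threaded process migrated through a non-owner thread moves no
 * charges. All the mc.* fields set here under mc.lock are what the
 * precharge and move walks consult, and a failed precharge unwinds via
 * mem_cgroup_clear_mc().
 */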
1204                                                  1204 
1205 void memcg1_cancel_attach(struct cgroup_tasks    1205 void memcg1_cancel_attach(struct cgroup_taskset *tset)
1206 {                                                1206 {
1207         if (mc.to)                               1207         if (mc.to)
1208                 mem_cgroup_clear_mc();           1208                 mem_cgroup_clear_mc();
1209 }                                                1209 }
1210                                                  1210 
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;
	enum mc_target_type target_type;
	union mc_target target;
	struct folio *folio;
	bool tried_split_before = false;

retry_pmd:
	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		if (mc.precharge < HPAGE_PMD_NR) {
			spin_unlock(ptl);
			return 0;
		}
		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
		if (target_type == MC_TARGET_PAGE) {
			folio = target.folio;
			/*
			 * Deferred split queue locking depends on memcg,
			 * and unqueue is unsafe unless folio refcount is 0:
			 * split or skip if on the queue? First try to split.
			 */
			if (!list_empty(&folio->_deferred_list)) {
				spin_unlock(ptl);
				if (!tried_split_before)
					split_folio(folio);
				folio_unlock(folio);
				folio_put(folio);
				if (tried_split_before)
					return 0;
				tried_split_before = true;
				goto retry_pmd;
			}
			/*
			 * So long as that pmd lock is held, the folio cannot
			 * be racily added to the _deferred_list, because
			 * __folio_remove_rmap() will find !partially_mapped.
			 */
			if (folio_isolate_lru(folio)) {
				if (!mem_cgroup_move_account(folio, true,
							     mc.from, mc.to)) {
					mc.precharge -= HPAGE_PMD_NR;
					mc.moved_charge += HPAGE_PMD_NR;
				}
				folio_putback_lru(folio);
			}
			folio_unlock(folio);
			folio_put(folio);
		} else if (target_type == MC_TARGET_DEVICE) {
			folio = target.folio;
			if (!mem_cgroup_move_account(folio, true,
						     mc.from, mc.to)) {
				mc.precharge -= HPAGE_PMD_NR;
				mc.moved_charge += HPAGE_PMD_NR;
			}
			folio_unlock(folio);
			folio_put(folio);
		}
		spin_unlock(ptl);
		return 0;
	}

retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!pte)
		return 0;
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = ptep_get(pte++);
		bool device = false;
		swp_entry_t ent;

		if (!mc.precharge)
			break;

		switch (get_mctgt_type(vma, addr, ptent, &target)) {
		case MC_TARGET_DEVICE:
			device = true;
			fallthrough;
		case MC_TARGET_PAGE:
			folio = target.folio;
			/*
			 * We can have a part of a split pmd here. Moving it
			 * could be done, but it would be too convoluted, so
			 * simply ignore such a partial THP and keep it in its
			 * original memcg. There should be somebody mapping
			 * the head.
			 */
			if (folio_test_large(folio))
				goto put;
			if (!device && !folio_isolate_lru(folio))
				goto put;
			if (!mem_cgroup_move_account(folio, false,
						mc.from, mc.to)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			if (!device)
				folio_putback_lru(folio);
put:			/* get_mctgt_type() gets & locks the page */
			folio_unlock(folio);
			folio_put(folio);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				mem_cgroup_id_get_many(mc.to, 1);
				/* we fix up other refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all the precharges we got in can_attach().
		 * We try to charge one by one, but don't do any additional
		 * charges to mc.to if we have already failed a charge in the
		 * attach() phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

static const struct mm_walk_ops charge_walk_ops = {
	.pmd_entry	= mem_cgroup_move_charge_pte_range,
	.walk_lock	= PGWALK_RDLOCK,
};

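/*
 * Walk the entire address space of mc.mm and move the precharged pages
 * over to mc.to. Moving is best-effort: when the mmap_lock cannot be
 * taken, outstanding precharges are dropped and the lock is retried.
 */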
static void mem_cgroup_move_charge(void)
{
	lru_add_drain_all();
	/*
	 * Signal folio_memcg_lock() to take the memcg's move_lock
	 * while we're moving its pages to another memcg. Then wait
	 * for already started RCU-only updates to finish.
	 */
	atomic_inc(&mc.from->moving_account);
	synchronize_rcu();
retry:
	if (unlikely(!mmap_read_trylock(mc.mm))) {
		/*
		 * Someone holding the mmap_lock might be waiting in the
		 * waitq. So we cancel all extra charges, wake up all waiters,
		 * and retry. Because we cancel precharges, we might not be
		 * able to move enough charges, but moving charge is a
		 * best-effort feature anyway, so it wouldn't be a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	/*
	 * When we have consumed all precharges and failed to do any
	 * additional charge, the page walk just aborts.
	 */
	walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
	mmap_read_unlock(mc.mm);
	atomic_dec(&mc.from->moving_account);
}

void memcg1_move_task(void)
{
	if (mc.to) {
		mem_cgroup_move_charge();
		mem_cgroup_clear_mc();
	}
}

#else	/* !CONFIG_MMU */
int memcg1_can_attach(struct cgroup_taskset *tset)
{
	return 0;
}
void memcg1_cancel_attach(struct cgroup_taskset *tset)
{
}
void memcg1_move_task(void)
{
}
#endif

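/*
 * Signal the eventfd of every threshold crossed since the last call,
 * scanning the sorted threshold array outward from the cached
 * current_threshold index in both directions.
 */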
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *t;
	unsigned long usage;
	int i;

	rcu_read_lock();
	if (!swap)
		t = rcu_dereference(memcg->thresholds.primary);
	else
		t = rcu_dereference(memcg->memsw_thresholds.primary);

	if (!t)
		goto unlock;

	usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold points to the threshold just below or equal to
	 * usage. If that no longer holds, a threshold was crossed after the
	 * last call of __mem_cgroup_threshold().
	 */
	i = t->current_threshold;

	/*
	 * Iterate backward over the array of thresholds starting from
	 * current_threshold and check if a threshold is crossed.
	 * If none of the thresholds below usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
		eventfd_signal(t->entries[i].eventfd);

	/* i = current_threshold + 1 */
	i++;

	/*
	 * Iterate forward over the array of thresholds starting from
	 * current_threshold+1 and check if a threshold is crossed.
	 * If none of the thresholds above usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
		eventfd_signal(t->entries[i].eventfd);

	/* Update current_threshold */
	t->current_threshold = i - 1;
unlock:
	rcu_read_unlock();
}

static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	while (memcg) {
		__mem_cgroup_threshold(memcg, false);
		if (do_memsw_account())
			__mem_cgroup_threshold(memcg, true);

		memcg = parent_mem_cgroup(memcg);
	}
}

/* Cgroup1: threshold notifications & softlimit tree updates */
struct memcg1_events_percpu {
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

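/*
 * Account one PGPGIN or PGPGOUT event and accumulate the per-CPU
 * page-event counter that drives the ratelimited threshold and
 * soft-limit checks below.
 */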
static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
{
	/* pagein of a big page is one event, so ignore the page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
}

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024

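/*
 * Return true once enough page events have accumulated since the last
 * time @target fired. The (long)(next - val) comparison is wrap-around
 * safe, the same trick time_after() uses in jiffies.h.
 */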
static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
				enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->events_percpu->nr_page_events);
	next = __this_cpu_read(memcg->events_percpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->events_percpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg1_check_events(struct mem_cgroup *memcg, int nid)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return;

	/* threshold events are triggered at a finer grain than soft limit */
	if (unlikely(memcg1_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = memcg1_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			memcg1_update_tree(memcg, nid);
	}
}

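/*
 * Called once a charge has been committed: update the v1 statistics and
 * run the event checks with interrupts disabled, as the per-CPU updates
 * require.
 */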
void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
	unsigned long flags;

	local_irq_save(flags);
	memcg1_charge_statistics(memcg, folio_nr_pages(folio));
	memcg1_check_events(memcg, folio_nid(folio));
	local_irq_restore(flags);
}

void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
{
	/*
	 * Interrupts should be disabled here because the caller holds the
	 * i_pages lock which is taken with interrupts-off. It is
	 * important here to have the interrupts disabled because it is the
	 * only synchronisation we have for updating the per-CPU variables.
	 */
	preempt_disable_nested();
	VM_WARN_ON_IRQS_ENABLED();
	memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
	preempt_enable_nested();
	memcg1_check_events(memcg, folio_nid(folio));
}

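/*
 * Flush a batch of uncharges: count the PGPGOUT events, add the freed
 * pages to the per-CPU event counter and re-run the event checks.
 */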
void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
			   unsigned long nr_memory, int nid)
{
	unsigned long flags;

	local_irq_save(flags);
	__count_memcg_events(memcg, PGPGOUT, pgpgout);
	__this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
	memcg1_check_events(memcg, nid);
	local_irq_restore(flags);
}

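/* sort() comparator: order thresholds ascending by value */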
static int compare_thresholds(const void *a, const void *b)
{
	const struct mem_cgroup_threshold *_a = a;
	const struct mem_cgroup_threshold *_b = b;

	if (_a->threshold > _b->threshold)
		return 1;

	if (_a->threshold < _b->threshold)
		return -1;

	return 0;
}

static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
	struct mem_cgroup_eventfd_list *ev;

	spin_lock(&memcg_oom_lock);

	list_for_each_entry(ev, &memcg->oom_notify, list)
		eventfd_signal(ev->eventfd);

	spin_unlock(&memcg_oom_lock);
	return 0;
}

static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		mem_cgroup_oom_notify_cb(iter);
}

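/*
 * Register a usage-threshold eventfd. The sorted threshold array is never
 * modified in place: a copy including the new entry is built and then
 * published with rcu_assign_pointer(), so lockless readers in
 * __mem_cgroup_threshold() always see a consistent array. The previous
 * primary array is kept around as a spare for the unregister path.
 */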
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long threshold;
	unsigned long usage;
	int i, size, ret;

	ret = page_counter_memparse(args, "-1", &threshold);
	if (ret)
		return ret;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	/* Check if a threshold was crossed before adding a new one */
	if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for the new array of thresholds */
	new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
	if (!new) {
		ret = -ENOMEM;
		goto unlock;
	}
	new->size = size;

	/* Copy thresholds (if any) to the new array */
	if (thresholds->primary)
		memcpy(new->entries, thresholds->primary->entries,
		       flex_array_size(new, entries, size - 1));

	/* Add the new threshold */
	new->entries[size - 1].eventfd = eventfd;
	new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering a new threshold isn't time-critical */
	sort(new->entries, size, sizeof(*new->entries),
			compare_thresholds, NULL);

	/* Find the current threshold */
	new->current_threshold = -1;
	for (i = 0; i < size; i++) {
		if (new->entries[i].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		} else
			break;
	}

	/* Free the old spare buffer and save the old primary buffer as spare */
	kfree(thresholds->spare);
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

unlock:
	mutex_unlock(&memcg->thresholds_lock);

	return ret;
}

static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
}

static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
}

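/*
 * Drop every threshold entry bound to @eventfd. The spare buffer saved
 * by the register path is reused to rebuild the array without the
 * removed entries, which is then published under RCU, mirroring the
 * copy-update scheme above.
 */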
static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long usage;
	int i, j, size, entries;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	if (!thresholds->primary)
		goto unlock;

	/* Check if a threshold was crossed before removing */
	__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	/* Calculate the new number of thresholds */
	size = entries = 0;
	for (i = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd != eventfd)
			size++;
		else
			entries++;
	}

	new = thresholds->spare;

	/* If no items related to eventfd have been cleared, nothing to do */
	if (!entries)
		goto unlock;

	/* Set thresholds array to NULL if we don't have thresholds */
	if (!size) {
		kfree(new);
		new = NULL;
		goto swap_buffers;
	}

	new->size = size;

	/* Copy thresholds and find the current threshold */
	new->current_threshold = -1;
	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd == eventfd)
			continue;

		new->entries[j] = thresholds->primary->entries[i];
		if (new->entries[j].threshold <= usage) {
			/*
			 * new->current_threshold will not be used
			 * until rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		}
		j++;
	}

swap_buffers:
	/* Swap the primary and spare arrays */
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

	/* If all events are unregistered, free the spare array */
	if (!new) {
		kfree(thresholds->spare);
		thresholds->spare = NULL;
	}
unlock:
	mutex_unlock(&memcg->thresholds_lock);
}

static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
}

static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
}

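/*
 * OOM notification: eventfds are kept on a plain list protected by
 * memcg_oom_lock and signalled from mem_cgroup_oom_notify() above.
 */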
static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	struct mem_cgroup_eventfd_list *event;

	event = kmalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	spin_lock(&memcg_oom_lock);

	event->eventfd = eventfd;
	list_add(&event->list, &memcg->oom_notify);

	/* already in OOM? */
	if (memcg->under_oom)
		eventfd_signal(eventfd);
	spin_unlock(&memcg_oom_lock);

	return 0;
}

static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	struct mem_cgroup_eventfd_list *ev, *tmp;

	spin_lock(&memcg_oom_lock);

	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
		if (ev->eventfd == eventfd) {
			list_del(&ev->list);
			kfree(ev);
		}
	}

	spin_unlock(&memcg_oom_lock);
}

1839 /*                                               1839 /*
1840  * DO NOT USE IN NEW FILES.                      1840  * DO NOT USE IN NEW FILES.
1841  *                                               1841  *
1842  * "cgroup.event_control" implementation.        1842  * "cgroup.event_control" implementation.
1843  *                                               1843  *
1844  * This is way over-engineered.  It tries to     1844  * This is way over-engineered.  It tries to support fully configurable
1845  * events for each user.  Such level of flexi    1845  * events for each user.  Such level of flexibility is completely
1846  * unnecessary especially in the light of the    1846  * unnecessary especially in the light of the planned unified hierarchy.
1847  *                                               1847  *
1848  * Please deprecate this and replace with som    1848  * Please deprecate this and replace with something simpler if at all
1849  * possible.                                     1849  * possible.
1850  */                                              1850  */
1851                                                  1851 
1852 /*                                               1852 /*
1853  * Unregister event and free resources.          1853  * Unregister event and free resources.
1854  *                                               1854  *
1855  * Gets called from workqueue.                   1855  * Gets called from workqueue.
1856  */                                              1856  */
1857 static void memcg_event_remove(struct work_st    1857 static void memcg_event_remove(struct work_struct *work)
1858 {                                                1858 {
1859         struct mem_cgroup_event *event =         1859         struct mem_cgroup_event *event =
1860                 container_of(work, struct mem    1860                 container_of(work, struct mem_cgroup_event, remove);
1861         struct mem_cgroup *memcg = event->mem    1861         struct mem_cgroup *memcg = event->memcg;
1862                                                  1862 
1863         remove_wait_queue(event->wqh, &event-    1863         remove_wait_queue(event->wqh, &event->wait);
1864                                                  1864 
1865         event->unregister_event(memcg, event-    1865         event->unregister_event(memcg, event->eventfd);
1866                                                  1866 
1867         /* Notify userspace the event is goin    1867         /* Notify userspace the event is going away. */
1868         eventfd_signal(event->eventfd);          1868         eventfd_signal(event->eventfd);
1869                                                  1869 
1870         eventfd_ctx_put(event->eventfd);         1870         eventfd_ctx_put(event->eventfd);
1871         kfree(event);                            1871         kfree(event);
1872         css_put(&memcg->css);                    1872         css_put(&memcg->css);
1873 }                                                1873 }
1874                                                  1874 
1875 /*                                               1875 /*
1876  * Gets called on EPOLLHUP on eventfd when us    1876  * Gets called on EPOLLHUP on eventfd when user closes it.
1877  *                                               1877  *
1878  * Called with wqh->lock held and interrupts     1878  * Called with wqh->lock held and interrupts disabled.
1879  */                                              1879  */
1880 static int memcg_event_wake(wait_queue_entry_    1880 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
1881                             int sync, void *k    1881                             int sync, void *key)
1882 {                                                1882 {
1883         struct mem_cgroup_event *event =         1883         struct mem_cgroup_event *event =
1884                 container_of(wait, struct mem    1884                 container_of(wait, struct mem_cgroup_event, wait);
1885         struct mem_cgroup *memcg = event->mem    1885         struct mem_cgroup *memcg = event->memcg;
1886         __poll_t flags = key_to_poll(key);       1886         __poll_t flags = key_to_poll(key);
1887                                                  1887 
1888         if (flags & EPOLLHUP) {                  1888         if (flags & EPOLLHUP) {
1889                 /*                               1889                 /*
1890                  * If the event has been deta    1890                  * If the event has been detached at cgroup removal, we
1891                  * can simply return knowing     1891                  * can simply return knowing the other side will cleanup
1892                  * for us.                       1892                  * for us.
1893                  *                               1893                  *
1894                  * We can't race against even    1894                  * We can't race against event freeing since the other
1895                  * side will require wqh->loc    1895                  * side will require wqh->lock via remove_wait_queue(),
1896                  * which we hold.                1896                  * which we hold.
1897                  */                              1897                  */
1898                 spin_lock(&memcg->event_list_    1898                 spin_lock(&memcg->event_list_lock);
1899                 if (!list_empty(&event->list)    1899                 if (!list_empty(&event->list)) {
1900                         list_del_init(&event-    1900                         list_del_init(&event->list);
1901                         /*                       1901                         /*
1902                          * We are in atomic c    1902                          * We are in atomic context, but cgroup_event_remove()
1903                          * may sleep, so we h    1903                          * may sleep, so we have to call it in workqueue.
1904                          */                      1904                          */
1905                         schedule_work(&event-    1905                         schedule_work(&event->remove);
1906                 }                                1906                 }
1907                 spin_unlock(&memcg->event_lis    1907                 spin_unlock(&memcg->event_list_lock);
1908         }                                        1908         }
1909                                                  1909 
1910         return 0;                                1910         return 0;
1911 }                                                1911 }
1912                                                  1912 
1913 static void memcg_event_ptable_queue_proc(str    1913 static void memcg_event_ptable_queue_proc(struct file *file,
1914                 wait_queue_head_t *wqh, poll_    1914                 wait_queue_head_t *wqh, poll_table *pt)
1915 {                                                1915 {
1916         struct mem_cgroup_event *event =         1916         struct mem_cgroup_event *event =
1917                 container_of(pt, struct mem_c    1917                 container_of(pt, struct mem_cgroup_event, pt);
1918                                                  1918 
1919         event->wqh = wqh;                        1919         event->wqh = wqh;
1920         add_wait_queue(wqh, &event->wait);       1920         add_wait_queue(wqh, &event->wait);
1921 }                                                1921 }

/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by control file implementation.
 */
static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
                                         char *buf, size_t nbytes, loff_t off)
{
        struct cgroup_subsys_state *css = of_css(of);
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup_event *event;
        struct cgroup_subsys_state *cfile_css;
        unsigned int efd, cfd;
        struct fd efile;
        struct fd cfile;
        struct dentry *cdentry;
        const char *name;
        char *endp;
        int ret;

        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                return -EOPNOTSUPP;

        buf = strstrip(buf);

        efd = simple_strtoul(buf, &endp, 10);
        if (*endp != ' ')
                return -EINVAL;
        buf = endp + 1;

        cfd = simple_strtoul(buf, &endp, 10);
        if (*endp == '\0')
                buf = endp;
        else if (*endp == ' ')
                buf = endp + 1;
        else
                return -EINVAL;

        event = kzalloc(sizeof(*event), GFP_KERNEL);
        if (!event)
                return -ENOMEM;

        event->memcg = memcg;
        INIT_LIST_HEAD(&event->list);
        init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
        init_waitqueue_func_entry(&event->wait, memcg_event_wake);
        INIT_WORK(&event->remove, memcg_event_remove);

        efile = fdget(efd);
        if (!fd_file(efile)) {
                ret = -EBADF;
                goto out_kfree;
        }

        event->eventfd = eventfd_ctx_fileget(fd_file(efile));
        if (IS_ERR(event->eventfd)) {
                ret = PTR_ERR(event->eventfd);
                goto out_put_efile;
        }

        cfile = fdget(cfd);
        if (!fd_file(cfile)) {
                ret = -EBADF;
                goto out_put_eventfd;
        }

        /* the process needs read permission on the control file */
        /* AV: shouldn't we check that it's been opened for read instead? */
        ret = file_permission(fd_file(cfile), MAY_READ);
        if (ret < 0)
                goto out_put_cfile;

        /*
         * The control file must be a regular cgroup1 file. As a regular cgroup
         * file can't be renamed, it's safe to access its name afterwards.
         */
        cdentry = fd_file(cfile)->f_path.dentry;
        if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
                ret = -EINVAL;
                goto out_put_cfile;
        }

        /*
         * Determine the event callbacks and set them in @event.  This used
         * to be done via struct cftype but cgroup core no longer knows
         * about these events.  The following is crude but the whole thing
         * is for compatibility anyway.
         *
         * DO NOT ADD NEW FILES.
         */
        name = cdentry->d_name.name;

        if (!strcmp(name, "memory.usage_in_bytes")) {
                event->register_event = mem_cgroup_usage_register_event;
                event->unregister_event = mem_cgroup_usage_unregister_event;
        } else if (!strcmp(name, "memory.oom_control")) {
                pr_warn_once("oom_control is deprecated and will be removed. "
                             "Please report your usecase to linux-mm@kvack.org "
                             "if you depend on this functionality.\n");
                event->register_event = mem_cgroup_oom_register_event;
                event->unregister_event = mem_cgroup_oom_unregister_event;
        } else if (!strcmp(name, "memory.pressure_level")) {
                pr_warn_once("pressure_level is deprecated and will be removed. "
                             "Please report your usecase to linux-mm@kvack.org "
                             "if you depend on this functionality.\n");
                event->register_event = vmpressure_register_event;
                event->unregister_event = vmpressure_unregister_event;
        } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
                event->register_event = memsw_cgroup_usage_register_event;
                event->unregister_event = memsw_cgroup_usage_unregister_event;
        } else {
                ret = -EINVAL;
                goto out_put_cfile;
        }

        /*
         * Verify that @cfile belongs to @css.  Also, remaining events are
         * automatically removed on cgroup destruction but the removal is
         * asynchronous, so take an extra ref on @css.
         */
        cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
                                               &memory_cgrp_subsys);
        ret = -EINVAL;
        if (IS_ERR(cfile_css))
                goto out_put_cfile;
        if (cfile_css != css) {
                css_put(cfile_css);
                goto out_put_cfile;
        }

        ret = event->register_event(memcg, event->eventfd, buf);
        if (ret)
                goto out_put_css;

        vfs_poll(fd_file(efile), &event->pt);

        spin_lock_irq(&memcg->event_list_lock);
        list_add(&event->list, &memcg->event_list);
        spin_unlock_irq(&memcg->event_list_lock);

        fdput(cfile);
        fdput(efile);

        return nbytes;

out_put_css:
        css_put(css);
out_put_cfile:
        fdput(cfile);
out_put_eventfd:
        eventfd_ctx_put(event->eventfd);
out_put_efile:
        fdput(efile);
out_kfree:
        kfree(event);

        return ret;
}
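
/*
 * Illustrative userspace usage of the interface above; a minimal sketch,
 * not taken from the kernel tree.  It assumes a cgroup1 memory controller
 * mounted at /sys/fs/cgroup/memory with an existing group "grp", and uses
 * the usage-threshold handler of memory.usage_in_bytes, whose <args> is a
 * byte threshold (52428800 == 50M):
 *
 *      int efd = eventfd(0, 0);
 *      int cfd = open("/sys/fs/cgroup/memory/grp/memory.usage_in_bytes",
 *                     O_RDONLY);
 *      int ctl = open("/sys/fs/cgroup/memory/grp/cgroup.event_control",
 *                     O_WRONLY);
 *      char cmd[64];
 *      uint64_t ticks;
 *
 *      snprintf(cmd, sizeof(cmd), "%d %d 52428800", efd, cfd);
 *      write(ctl, cmd, strlen(cmd));
 *
 *      read(efd, &ticks, sizeof(ticks));       blocks until the
 *                                              threshold is crossed
 */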

void memcg1_memcg_init(struct mem_cgroup *memcg)
{
        INIT_LIST_HEAD(&memcg->oom_notify);
        mutex_init(&memcg->thresholds_lock);
        spin_lock_init(&memcg->move_lock);
        INIT_LIST_HEAD(&memcg->event_list);
        spin_lock_init(&memcg->event_list_lock);
}

void memcg1_css_offline(struct mem_cgroup *memcg)
{
        struct mem_cgroup_event *event, *tmp;

        /*
         * Unregister events and notify userspace.
         * Notify userspace about the cgroup's removal only after rmdir of
         * the cgroup directory, to avoid a race between userspace and the
         * kernel.
         */
        spin_lock_irq(&memcg->event_list_lock);
        list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
                list_del_init(&event->list);
                schedule_work(&event->remove);
        }
        spin_unlock_irq(&memcg->event_list_lock);
}

/*
 * Check whether the OOM killer is already running in our hierarchy.
 * If it is, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
        struct mem_cgroup *iter, *failed = NULL;

        spin_lock(&memcg_oom_lock);

        for_each_mem_cgroup_tree(iter, memcg) {
                if (iter->oom_lock) {
                        /*
                         * this subtree of our hierarchy is already locked
                         * so we cannot take the lock.
                         */
                        failed = iter;
                        mem_cgroup_iter_break(memcg, iter);
                        break;
                } else
                        iter->oom_lock = true;
        }

        if (failed) {
                /*
                 * OK, we failed to lock the whole subtree so we have
                 * to clean up what we already set up, up to the failing
                 * subtree.
                 */
                for_each_mem_cgroup_tree(iter, memcg) {
                        if (iter == failed) {
                                mem_cgroup_iter_break(memcg, iter);
                                break;
                        }
                        iter->oom_lock = false;
                }
        } else
                mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

        spin_unlock(&memcg_oom_lock);

        return !failed;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
        struct mem_cgroup *iter;

        spin_lock(&memcg_oom_lock);
        mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
        for_each_mem_cgroup_tree(iter, memcg)
                iter->oom_lock = false;
        spin_unlock(&memcg_oom_lock);
}
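
/*
 * Note on the mutex_acquire()/mutex_release() pair above (an explanatory
 * aside, not from the original source): memcg_oom_lock is a plain spinlock
 * protecting only the per-memcg oom_lock flags, while the separate
 * memcg_oom_lock_dep_map lets lockdep track the hierarchical OOM lock as
 * if it were a mutex held from trylock to unlock, so lock-ordering
 * problems involving the OOM lock can be reported even though no real
 * mutex object exists.
 */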

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
        struct mem_cgroup *iter;

        spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
                iter->under_oom++;
        spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
        struct mem_cgroup *iter;

        /*
         * Be careful about under_oom underflows because a child memcg
         * could have been added after mem_cgroup_mark_under_oom.
         */
        spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
                if (iter->under_oom > 0)
                        iter->under_oom--;
        spin_unlock(&memcg_oom_lock);
}

static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
        struct mem_cgroup *memcg;
        wait_queue_entry_t      wait;
};

static int memcg_oom_wake_function(wait_queue_entry_t *wait,
        unsigned mode, int sync, void *arg)
{
        struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
        struct mem_cgroup *oom_wait_memcg;
        struct oom_wait_info *oom_wait_info;

        oom_wait_info = container_of(wait, struct oom_wait_info, wait);
        oom_wait_memcg = oom_wait_info->memcg;

        if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
            !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
                return 0;
        return autoremove_wake_function(wait, mode, sync, arg);
}

void memcg1_oom_recover(struct mem_cgroup *memcg)
{
        /*
         * For the following lockless ->under_oom test, the only required
         * guarantee is that it must see the state asserted by an OOM when
         * this function is called as a result of userland actions
         * triggered by the notification of the OOM.  This is trivially
         * achieved by invoking mem_cgroup_mark_under_oom() before
         * triggering notification.
         */
        if (memcg && memcg->under_oom)
                __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
        struct mem_cgroup *memcg = current->memcg_in_oom;
        struct oom_wait_info owait;
        bool locked;

        /* OOM is global, do not handle */
        if (!memcg)
                return false;

        if (!handle)
                goto cleanup;

        owait.memcg = memcg;
        owait.wait.flags = 0;
        owait.wait.func = memcg_oom_wake_function;
        owait.wait.private = current;
        INIT_LIST_HEAD(&owait.wait.entry);

        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
        mem_cgroup_mark_under_oom(memcg);

        locked = mem_cgroup_oom_trylock(memcg);

        if (locked)
                mem_cgroup_oom_notify(memcg);

        schedule();
        mem_cgroup_unmark_under_oom(memcg);
        finish_wait(&memcg_oom_waitq, &owait.wait);

        if (locked)
                mem_cgroup_oom_unlock(memcg);
cleanup:
        current->memcg_in_oom = NULL;
        css_put(&memcg->css);
        return true;
}
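
/*
 * For reference, a sketch (simplified, not code from this file) of how the
 * fault path reaches the function above via pagefault_out_of_memory() in
 * mm/oom_kill.c:
 *
 *      void pagefault_out_of_memory(void)
 *      {
 *              if (mem_cgroup_oom_synchronize(true))
 *                      return;         (memcg OOM handled, fault done)
 *              ...
 *      }
 *
 * A call with handle == false only clears current->memcg_in_oom and drops
 * the css reference taken in memcg1_oom_prepare() below.
 */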

bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked)
{
        /*
         * We are in the middle of the charge context here, so we
         * don't want to block when potentially sitting on a callstack
         * that holds all kinds of filesystem and mm locks.
         *
         * cgroup1 allows disabling the OOM killer and waiting for outside
         * handling until the charge can succeed; remember the context and put
         * the task to sleep at the end of the page fault when all locks are
         * released.
         *
         * On the other hand, the in-kernel OOM killer allows for an async
         * victim memory reclaim (oom_reaper), which means that we are not
         * solely relying on the oom victim to make forward progress and we
         * can invoke the oom killer here.
         *
         * Please note that mem_cgroup_out_of_memory might fail to find a
         * victim and then we have to bail out from the charge path.
         */
        if (READ_ONCE(memcg->oom_kill_disable)) {
                if (current->in_user_fault) {
                        css_get(&memcg->css);
                        current->memcg_in_oom = memcg;
                }
                return false;
        }

        mem_cgroup_mark_under_oom(memcg);

        *locked = mem_cgroup_oom_trylock(memcg);

        if (*locked)
                mem_cgroup_oom_notify(memcg);

        mem_cgroup_unmark_under_oom(memcg);

        return true;
}

void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
{
        if (locked)
                mem_cgroup_oom_unlock(memcg);
}
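
/*
 * Sketch of how the two helpers above pair up in the charge path in
 * mm/memcontrol.c (illustrative and simplified):
 *
 *      bool locked, ret;
 *
 *      if (!memcg1_oom_prepare(memcg, &locked))
 *              return false;           (wait deferred to the fault end,
 *                                       see mem_cgroup_oom_synchronize())
 *
 *      ret = mem_cgroup_out_of_memory(memcg, gfp_mask, order);
 *
 *      memcg1_oom_finish(memcg, locked);
 */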

static DEFINE_MUTEX(memcg_max_mutex);

static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
                                 unsigned long max, bool memsw)
{
        bool enlarge = false;
        bool drained = false;
        int ret;
        bool limits_invariant;
        struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;

        do {
                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }

                mutex_lock(&memcg_max_mutex);
                /*
                 * Make sure that the new limit (memsw or memory limit) doesn't
                 * break our basic invariant rule memory.max <= memsw.max.
                 */
                limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
                                           max <= memcg->memsw.max;
                if (!limits_invariant) {
                        mutex_unlock(&memcg_max_mutex);
                        ret = -EINVAL;
                        break;
                }
                if (max > counter->max)
                        enlarge = true;
                ret = page_counter_set_max(counter, max);
                mutex_unlock(&memcg_max_mutex);

                if (!ret)
                        break;

                if (!drained) {
                        drain_all_stock(memcg);
                        drained = true;
                        continue;
                }

                if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
                                memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
                        ret = -EBUSY;
                        break;
                }
        } while (true);

        if (!ret && enlarge)
                memcg1_oom_recover(memcg);

        return ret;
}
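
/*
 * From userspace this is reached by writing to the v1 limit files.  A
 * minimal sketch, assuming a cgroup1 memory hierarchy mounted at
 * /sys/fs/cgroup/memory with an existing group "grp":
 *
 *      int fd = open("/sys/fs/cgroup/memory/grp/memory.limit_in_bytes",
 *                    O_WRONLY);
 *
 *      write(fd, "536870912", 9);      set the limit to 512M
 *
 * Shrinking the limit below current usage makes the loop above drain the
 * percpu charge stocks and then reclaim until usage fits, until the write
 * is interrupted by a signal (-EINTR), or until reclaim stops making
 * progress (-EBUSY).
 */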

/*
 * Reclaims as many pages from the given memcg as possible.
 *
 * Caller is responsible for holding css reference for memcg.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
        int nr_retries = MAX_RECLAIM_RETRIES;

        /* we call try-to-free pages to make this cgroup empty */
        lru_add_drain_all();

        drain_all_stock(memcg);

        /* try to free all pages in this cgroup */
        while (nr_retries && page_counter_read(&memcg->memory)) {
                if (signal_pending(current))
                        return -EINTR;

                if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
                                                  MEMCG_RECLAIM_MAY_SWAP, NULL))
                        nr_retries--;
        }

        return 0;
}
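
/*
 * Illustrative trigger from userspace (a sketch, not kernel code): any
 * write to the group's memory.force_empty file lands in
 * mem_cgroup_force_empty_write() below:
 *
 *      int fd = open("/sys/fs/cgroup/memory/grp/memory.force_empty",
 *                    O_WRONLY);
 *
 *      write(fd, "0", 1);
 *
 * This is typically done just before rmdir of an empty group so that as
 * many of its remaining pages as possible are reclaimed first.
 */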

static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
                                            char *buf, size_t nbytes,
                                            loff_t off)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));

        if (mem_cgroup_is_root(memcg))
                return -EINVAL;
        return mem_cgroup_force_empty(memcg) ?: nbytes;
}

static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
                                     struct cftype *cft)
{
        return 1;
}

static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
                                      struct cftype *cft, u64 val)
{
        if (val == 1)
                return 0;

        pr_warn_once("Non-hierarchical mode is deprecated. "
                     "Please report your usecase to linux-mm@kvack.org if you "
                     "depend on this functionality.\n");

        return -EINVAL;
}

static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
                               struct cftype *cft)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct page_counter *counter;

        switch (MEMFILE_TYPE(cft->private)) {
        case _MEM:
                counter = &memcg->memory;
                break;
        case _MEMSWAP:
                counter = &memcg->memsw;
                break;
        case _KMEM:
                counter = &memcg->kmem;
                break;
        case _TCP:
                counter = &memcg->tcpmem;
                break;
        default:
                BUG();
        }

        switch (MEMFILE_ATTR(cft->private)) {
        case RES_USAGE:
                if (counter == &memcg->memory)
                        return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
                if (counter == &memcg->memsw)
                        return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
                return (u64)page_counter_read(counter) * PAGE_SIZE;
        case RES_LIMIT:
                return (u64)counter->max * PAGE_SIZE;
        case RES_MAX_USAGE:
                return (u64)counter->watermark * PAGE_SIZE;
        case RES_FAILCNT:
                return counter->failcnt;
        case RES_SOFT_LIMIT:
                return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
        default:
                BUG();
        }
}
2485                                                  2485 
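/*
 * cft->private packs a counter type (_MEM, _MEMSWAP, _KMEM, _TCP) and an
 * attribute (RES_USAGE, RES_LIMIT, ...) into one word via MEMFILE_PRIVATE(),
 * defined earlier in this file; MEMFILE_TYPE() and MEMFILE_ATTR() unpack it
 * here. All counters are kept in pages internally, hence the "* PAGE_SIZE"
 * when reporting bytes: e.g. a limit of 131072 pages reads back as
 * 536870912 on a 4K-page system.
 */
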
/*
 * This function doesn't do anything useful. Its only job is to provide a read
 * handler for a file so that cgroup_file_mode() will add read permissions.
 */
static int mem_cgroup_dummy_seq_show(__always_unused struct seq_file *m,
                                     __always_unused void *v)
{
        return -EINVAL;
}

static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
{
        int ret;

        mutex_lock(&memcg_max_mutex);

        ret = page_counter_set_max(&memcg->tcpmem, max);
        if (ret)
                goto out;

        if (!memcg->tcpmem_active) {
                /*
                 * The active flag needs to be written after the static_key
                 * update. This is what guarantees that the socket activation
                 * function is the last one to run. See mem_cgroup_sk_alloc()
                 * for details, and note that we don't mark any socket as
                 * belonging to this memcg until that flag is up.
                 *
                 * We need to do this, because static_keys will span multiple
                 * sites, but we can't control their order. If we mark a socket
                 * as accounted, but the accounting functions are not patched in
                 * yet, we'll lose accounting.
                 *
                 * We never race with the readers in mem_cgroup_sk_alloc(),
                 * because when this value changes, the code to process it is
                 * not patched in yet.
                 */
                static_branch_inc(&memcg_sockets_enabled_key);
                memcg->tcpmem_active = true;
        }
out:
        mutex_unlock(&memcg_max_mutex);
        return ret;
}

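/*
 * memcg_sockets_enabled_key is a reference-counted static key:
 * static_branch_inc() patches the socket-accounting call sites in when
 * the first group sets a TCP limit, and they stay patched while any
 * group still holds a reference.
 */
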
/*
 * Handles writes to the RES_LIMIT and RES_SOFT_LIMIT control files.
 */
static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
                                char *buf, size_t nbytes, loff_t off)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
        unsigned long nr_pages;
        int ret;

        buf = strstrip(buf);
        ret = page_counter_memparse(buf, "-1", &nr_pages);
        if (ret)
                return ret;

        switch (MEMFILE_ATTR(of_cft(of)->private)) {
        case RES_LIMIT:
                if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
                        ret = -EINVAL;
                        break;
                }
                switch (MEMFILE_TYPE(of_cft(of)->private)) {
                case _MEM:
                        ret = mem_cgroup_resize_max(memcg, nr_pages, false);
                        break;
                case _MEMSWAP:
                        ret = mem_cgroup_resize_max(memcg, nr_pages, true);
                        break;
                case _KMEM:
                        pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
                                     "Writing any value to this file has no effect. "
                                     "Please report your usecase to linux-mm@kvack.org if you "
                                     "depend on this functionality.\n");
                        ret = 0;
                        break;
                case _TCP:
                        pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. "
                                     "Please report your usecase to linux-mm@kvack.org if you "
                                     "depend on this functionality.\n");
                        ret = memcg_update_tcp_max(memcg, nr_pages);
                        break;
                }
                break;
        case RES_SOFT_LIMIT:
                if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                        ret = -EOPNOTSUPP;
                } else {
                        pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. "
                                     "Please report your usecase to linux-mm@kvack.org if you "
                                     "depend on this functionality.\n");
                        WRITE_ONCE(memcg->soft_limit, nr_pages);
                        ret = 0;
                }
                break;
        }
        return ret ?: nbytes;
}

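/*
 * page_counter_memparse() accepts memparse()-style suffixes and "-1" for
 * "unlimited", so all of these are valid (paths assume a v1 mount at
 * /sys/fs/cgroup/memory):
 *
 *      echo 512M > /sys/fs/cgroup/memory/foo/memory.limit_in_bytes
 *      echo 1G   > /sys/fs/cgroup/memory/foo/memory.soft_limit_in_bytes
 *      echo -1   > /sys/fs/cgroup/memory/foo/memory.limit_in_bytes
 *
 * Byte values are rounded down to whole pages before being applied.
 */
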
static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
                                size_t nbytes, loff_t off)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
        struct page_counter *counter;

        switch (MEMFILE_TYPE(of_cft(of)->private)) {
        case _MEM:
                counter = &memcg->memory;
                break;
        case _MEMSWAP:
                counter = &memcg->memsw;
                break;
        case _KMEM:
                counter = &memcg->kmem;
                break;
        case _TCP:
                counter = &memcg->tcpmem;
                break;
        default:
                BUG();
        }

        switch (MEMFILE_ATTR(of_cft(of)->private)) {
        case RES_MAX_USAGE:
                page_counter_reset_watermark(counter);
                break;
        case RES_FAILCNT:
                counter->failcnt = 0;
                break;
        default:
                BUG();
        }

        return nbytes;
}

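/*
 * The reset files are write-only triggers; the written value is ignored
 * (the customary "echo 0" is just convention):
 *
 *      echo 0 > .../memory.max_usage_in_bytes    (reset high watermark)
 *      echo 0 > .../memory.failcnt               (reset failure counter)
 */
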
#ifdef CONFIG_NUMA

#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
#define LRU_ALL      ((1 << NR_LRU_LISTS) - 1)

static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
                                int nid, unsigned int lru_mask, bool tree)
{
        struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
        unsigned long nr = 0;
        enum lru_list lru;

        VM_BUG_ON((unsigned)nid >= nr_node_ids);

        for_each_lru(lru) {
                if (!(BIT(lru) & lru_mask))
                        continue;
                if (tree)
                        nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
                else
                        nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
        }
        return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
                                             unsigned int lru_mask,
                                             bool tree)
{
        unsigned long nr = 0;
        enum lru_list lru;

        for_each_lru(lru) {
                if (!(BIT(lru) & lru_mask))
                        continue;
                if (tree)
                        nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
                else
                        nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
        }
        return nr;
}

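/*
 * In both helpers, 'tree' selects between hierarchical totals (this memcg
 * plus all of its descendants) and local counts (this memcg only); the two
 * feed the "hierarchical_*" and plain rows of memory.numa_stat respectively.
 */
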
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
        struct numa_stat {
                const char *name;
                unsigned int lru_mask;
        };

        static const struct numa_stat stats[] = {
                { "total", LRU_ALL },
                { "file", LRU_ALL_FILE },
                { "anon", LRU_ALL_ANON },
                { "unevictable", BIT(LRU_UNEVICTABLE) },
        };
        const struct numa_stat *stat;
        int nid;
        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

        mem_cgroup_flush_stats(memcg);

        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
                seq_printf(m, "%s=%lu", stat->name,
                           mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
                                                   false));
                for_each_node_state(nid, N_MEMORY)
                        seq_printf(m, " N%d=%lu", nid,
                                   mem_cgroup_node_nr_lru_pages(memcg, nid,
                                                        stat->lru_mask, false));
                seq_putc(m, '\n');
        }

        for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
                seq_printf(m, "hierarchical_%s=%lu", stat->name,
                           mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
                                                   true));
                for_each_node_state(nid, N_MEMORY)
                        seq_printf(m, " N%d=%lu", nid,
                                   mem_cgroup_node_nr_lru_pages(memcg, nid,
                                                        stat->lru_mask, true));
                seq_putc(m, '\n');
        }

        return 0;
}
#endif /* CONFIG_NUMA */

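/*
 * Sample memory.numa_stat output on a two-node machine (counts are in
 * pages; values are illustrative):
 *
 *      total=985 N0=495 N1=490
 *      file=164 N0=102 N1=62
 *      anon=821 N0=393 N1=428
 *      unevictable=0 N0=0 N1=0
 *
 * followed by the same rows prefixed with "hierarchical_", which include
 * descendant groups.
 */
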
static const unsigned int memcg1_stats[] = {
        NR_FILE_PAGES,
        NR_ANON_MAPPED,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        NR_ANON_THPS,
#endif
        NR_SHMEM,
        NR_FILE_MAPPED,
        NR_FILE_DIRTY,
        NR_WRITEBACK,
        WORKINGSET_REFAULT_ANON,
        WORKINGSET_REFAULT_FILE,
#ifdef CONFIG_SWAP
        MEMCG_SWAP,
        NR_SWAPCACHE,
#endif
};

static const char *const memcg1_stat_names[] = {
        "cache",
        "rss",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        "rss_huge",
#endif
        "shmem",
        "mapped_file",
        "dirty",
        "writeback",
        "workingset_refault_anon",
        "workingset_refault_file",
#ifdef CONFIG_SWAP
        "swap",
        "swapcached",
#endif
};

/* Universal VM events cgroup1 shows, original sort order */
static const unsigned int memcg1_events[] = {
        PGPGIN,
        PGPGOUT,
        PGFAULT,
        PGMAJFAULT,
};

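/*
 * memcg1_stats[] and memcg1_stat_names[] must stay index-aligned,
 * including the #ifdef'ed entries; memcg1_stat_format() below enforces
 * this with a BUILD_BUG_ON().
 */
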
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
{
        unsigned long memory, memsw;
        struct mem_cgroup *mi;
        unsigned int i;

        BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));

        mem_cgroup_flush_stats(memcg);

        for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
                unsigned long nr;

                nr = memcg_page_state_local_output(memcg, memcg1_stats[i]);
                seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr);
        }

        for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
                seq_buf_printf(s, "%s %lu\n", vm_event_name(memcg1_events[i]),
                               memcg_events_local(memcg, memcg1_events[i]));

        for (i = 0; i < NR_LRU_LISTS; i++)
                seq_buf_printf(s, "%s %lu\n", lru_list_name(i),
                               memcg_page_state_local(memcg, NR_LRU_BASE + i) *
                               PAGE_SIZE);

        /* Hierarchical information */
        memory = memsw = PAGE_COUNTER_MAX;
        for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
                memory = min(memory, READ_ONCE(mi->memory.max));
                memsw = min(memsw, READ_ONCE(mi->memsw.max));
        }
        seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
                       (u64)memory * PAGE_SIZE);
        seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
                       (u64)memsw * PAGE_SIZE);

        for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
                unsigned long nr;

                nr = memcg_page_state_output(memcg, memcg1_stats[i]);
                seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
                               (u64)nr);
        }

        for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
                seq_buf_printf(s, "total_%s %llu\n",
                               vm_event_name(memcg1_events[i]),
                               (u64)memcg_events(memcg, memcg1_events[i]));

        for (i = 0; i < NR_LRU_LISTS; i++)
                seq_buf_printf(s, "total_%s %llu\n", lru_list_name(i),
                               (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
                               PAGE_SIZE);

#ifdef CONFIG_DEBUG_VM
        {
                pg_data_t *pgdat;
                struct mem_cgroup_per_node *mz;
                unsigned long anon_cost = 0;
                unsigned long file_cost = 0;

                for_each_online_pgdat(pgdat) {
                        mz = memcg->nodeinfo[pgdat->node_id];

                        anon_cost += mz->lruvec.anon_cost;
                        file_cost += mz->lruvec.file_cost;
                }
                seq_buf_printf(s, "anon_cost %lu\n", anon_cost);
                seq_buf_printf(s, "file_cost %lu\n", file_cost);
        }
#endif
}

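/*
 * Abridged, illustrative memory.stat output (byte values are always
 * page-aligned because the internal counters are kept in pages):
 *
 *      cache 212992
 *      rss 104857600
 *      ...
 *      hierarchical_memory_limit 536870912
 *      total_cache 212992
 *      ...
 *
 * The plain rows are local to the group; the total_* rows include all
 * descendants.
 */
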
static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
                                      struct cftype *cft)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        return mem_cgroup_swappiness(memcg);
}

static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
                                       struct cftype *cft, u64 val)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        if (val > MAX_SWAPPINESS)
                return -EINVAL;

        if (!mem_cgroup_is_root(memcg))
                WRITE_ONCE(memcg->swappiness, val);
        else
                WRITE_ONCE(vm_swappiness, val);

        return 0;
}

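/*
 * Per-cgroup swappiness mirrors /proc/sys/vm/swappiness; writes to the
 * root group update the global vm_swappiness instead, and values above
 * MAX_SWAPPINESS are rejected:
 *
 *      echo 60 > /sys/fs/cgroup/memory/foo/memory.swappiness
 */
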
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
        struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);

        seq_printf(sf, "oom_kill_disable %d\n", READ_ONCE(memcg->oom_kill_disable));
        seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
        seq_printf(sf, "oom_kill %lu\n",
                   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
        return 0;
}

static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
        struct cftype *cft, u64 val)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        pr_warn_once("oom_control is deprecated and will be removed. "
                     "Please report your usecase to linux-mm@kvack.org if you "
                     "depend on this functionality.\n");

        /* cannot set to root cgroup and only 0 and 1 are allowed */
        if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
                return -EINVAL;

        WRITE_ONCE(memcg->oom_kill_disable, val);
        if (!val)
                memcg1_oom_recover(memcg);

        return 0;
}

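/*
 * Illustrative memory.oom_control read-out:
 *
 *      oom_kill_disable 0
 *      under_oom 0
 *      oom_kill 0
 *
 * Writing 1 disables the OOM killer for the group; writing 0 re-enables
 * it and wakes any tasks currently blocked waiting for OOM handling.
 */
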
#ifdef CONFIG_SLUB_DEBUG
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
        /*
         * Deprecated. kmem.slabinfo is kept only as an empty placeholder;
         * use tools/cgroup/memcg_slabinfo.py instead.
         */
        return 0;
}
#endif

struct cftype mem_cgroup_legacy_files[] = {
        {
                .name = "usage_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "max_usage_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "limit_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
                .write = mem_cgroup_write,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "soft_limit_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
                .write = mem_cgroup_write,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "failcnt",
                .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "stat",
                .seq_show = memory_stat_show,
        },
        {
                .name = "force_empty",
                .write = mem_cgroup_force_empty_write,
        },
        {
                .name = "use_hierarchy",
                .write_u64 = mem_cgroup_hierarchy_write,
                .read_u64 = mem_cgroup_hierarchy_read,
        },
        {
                .name = "cgroup.event_control",         /* XXX: for compat */
                .write = memcg_write_event_control,
                .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
        },
        {
                .name = "swappiness",
                .read_u64 = mem_cgroup_swappiness_read,
                .write_u64 = mem_cgroup_swappiness_write,
        },
        {
                .name = "move_charge_at_immigrate",
                .read_u64 = mem_cgroup_move_charge_read,
                .write_u64 = mem_cgroup_move_charge_write,
        },
        {
                .name = "oom_control",
                .seq_show = mem_cgroup_oom_control_read,
                .write_u64 = mem_cgroup_oom_control_write,
        },
        {
                .name = "pressure_level",
                .seq_show = mem_cgroup_dummy_seq_show,
        },
#ifdef CONFIG_NUMA
        {
                .name = "numa_stat",
                .seq_show = memcg_numa_stat_show,
        },
#endif
        {
                .name = "kmem.limit_in_bytes",
                .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
                .write = mem_cgroup_write,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.usage_in_bytes",
                .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.failcnt",
                .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.max_usage_in_bytes",
                .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
#ifdef CONFIG_SLUB_DEBUG
        {
                .name = "kmem.slabinfo",
                .seq_show = mem_cgroup_slab_show,
        },
#endif
        {
                .name = "kmem.tcp.limit_in_bytes",
                .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
                .write = mem_cgroup_write,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.tcp.usage_in_bytes",
                .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.tcp.failcnt",
                .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.tcp.max_usage_in_bytes",
                .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        { },    /* terminate */
};

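/*
 * The zeroed sentinel entry terminates the array for the cgroup core,
 * which walks cftype arrays until it reaches an entry with an empty name.
 */
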
struct cftype memsw_files[] = {
        {
                .name = "memsw.usage_in_bytes",
                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "memsw.max_usage_in_bytes",
                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "memsw.limit_in_bytes",
                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
                .write = mem_cgroup_write,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "memsw.failcnt",
                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        { },    /* terminate */
};

void memcg1_account_kmem(struct mem_cgroup *memcg, int nr_pages)
{
	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
		if (nr_pages > 0)
			page_counter_charge(&memcg->kmem, nr_pages);
		else
			page_counter_uncharge(&memcg->kmem, -nr_pages);
	}
}

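/*
 * Charge socket buffer pages against the v1 "tcpmem" counter. On success
 * the socket-pressure flag is cleared; on failure it is set so that the
 * networking side can throttle further socket memory consumption. For
 * __GFP_NOFAIL requests the charge is forced through even above the
 * limit. Returns true if the pages were charged.
 */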
bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
			 gfp_t gfp_mask)
{
	struct page_counter *fail;

	if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
		memcg->tcpmem_pressure = 0;
		return true;
	}
	memcg->tcpmem_pressure = 1;
	if (gfp_mask & __GFP_NOFAIL) {
		page_counter_charge(&memcg->tcpmem, nr_pages);
		return true;
	}
	return false;
}

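/*
 * Allocate the per-CPU state behind the v1 event machinery (threshold
 * notifications and soft limit tree updates). Returns false if the
 * percpu allocation fails.
 */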
bool memcg1_alloc_events(struct mem_cgroup *memcg)
{
	memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu,
						GFP_KERNEL_ACCOUNT);
	return !!memcg->events_percpu;
}

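/* Counterpart of memcg1_alloc_events(); tolerates a failed allocation. */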
void memcg1_free_events(struct mem_cgroup *memcg)
{
	if (memcg->events_percpu)
		free_percpu(memcg->events_percpu);
}

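/*
 * Boot-time setup: allocate one soft limit RB-tree root per node. This
 * runs as a subsys_initcall(), where the small GFP_KERNEL allocations
 * are assumed to succeed; a NULL return from kzalloc_node() would oops
 * on the first dereference below.
 */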
static int __init memcg1_init(void)
{
	int node;

	for_each_node(node) {
		struct mem_cgroup_tree_per_node *rtpn;

		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node);

		rtpn->rb_root = RB_ROOT;
		rtpn->rb_rightmost = NULL;
		spin_lock_init(&rtpn->lock);
		soft_limit_tree.rb_tree_per_node[node] = rtpn;
	}

	return 0;
}
subsys_initcall(memcg1_init);