~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~
memcontrol-v1.c

Version: ~ [ linux-6.12-rc7 ] ~ [ linux-6.11.7 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.60 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.116 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.171 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.229 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.285 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.323 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.12 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~
Diff markup

Differences between /mm/memcontrol-v1.c (Version linux-6.12-rc7) and /mm/memcontrol-v1.c (Version linux-5.17.15)

  1 // SPDX-License-Identifier: GPL-2.0-or-later        1 
  2                                                   
  3 #include <linux/memcontrol.h>                     
  4 #include <linux/swap.h>                           
  5 #include <linux/mm_inline.h>                      
  6 #include <linux/pagewalk.h>                       
  7 #include <linux/backing-dev.h>                    
  8 #include <linux/swap_cgroup.h>                    
  9 #include <linux/eventfd.h>                        
 10 #include <linux/poll.h>                           
 11 #include <linux/sort.h>                           
 12 #include <linux/file.h>                           
 13 #include <linux/seq_buf.h>                        
 14                                                   
 15 #include "internal.h"                             
 16 #include "swap.h"                                 
 17 #include "memcontrol-v1.h"                        
 18                                                   
 19 /*                                                
 20  * Cgroups above their limits are maintained i    
 21  * their hierarchy representation                 
 22  */                                               
 23                                                   
 24 struct mem_cgroup_tree_per_node {                 
 25         struct rb_root rb_root;                   
 26         struct rb_node *rb_rightmost;             
 27         spinlock_t lock;                          
 28 };                                                
 29                                                   
 30 struct mem_cgroup_tree {                          
 31         struct mem_cgroup_tree_per_node *rb_tr    
 32 };                                                
 33                                                   
 34 static struct mem_cgroup_tree soft_limit_tree     
 35                                                   
 36 /*                                                
 37  * Maximum loops in mem_cgroup_soft_reclaim(),    
 38  * limit reclaim to prevent infinite loops, if    
 39  */                                               
 40 #define MEM_CGROUP_MAX_RECLAIM_LOOPS              
 41 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOP    
 42                                                   
 43 /* Stuffs for move charges at task migration.     
 44 /*                                                
 45  * Types of charges to be moved.                  
 46  */                                               
 47 #define MOVE_ANON       0x1ULL                    
 48 #define MOVE_FILE       0x2ULL                    
 49 #define MOVE_MASK       (MOVE_ANON | MOVE_FILE    
 50                                                   
 51 /* "mc" and its members are protected by cgrou    
 52 static struct move_charge_struct {                
 53         spinlock_t        lock; /* for from, t    
 54         struct mm_struct  *mm;                    
 55         struct mem_cgroup *from;                  
 56         struct mem_cgroup *to;                    
 57         unsigned long flags;                      
 58         unsigned long precharge;                  
 59         unsigned long moved_charge;               
 60         unsigned long moved_swap;                 
 61         struct task_struct *moving_task;          
 62         wait_queue_head_t waitq;                  
 63 } mc = {                                          
 64         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),    
 65         .waitq = __WAIT_QUEUE_HEAD_INITIALIZER    
 66 };                                                
 67                                                   
 68 /* for OOM */                                     
 69 struct mem_cgroup_eventfd_list {                  
 70         struct list_head list;                    
 71         struct eventfd_ctx *eventfd;              
 72 };                                                
 73                                                   
 74 /*                                                
 75  * cgroup_event represents events which usersp    
 76  */                                               
 77 struct mem_cgroup_event {                         
 78         /*                                        
 79          * memcg which the event belongs to.      
 80          */                                       
 81         struct mem_cgroup *memcg;                 
 82         /*                                        
 83          * eventfd to signal userspace about t    
 84          */                                       
 85         struct eventfd_ctx *eventfd;              
 86         /*                                        
 87          * Each of these stored in a list by t    
 88          */                                       
 89         struct list_head list;                    
 90         /*                                        
 91          * register_event() callback will be u    
 92          * waiter for changes related to this     
 93          * on eventfd to send notification to     
 94          */                                       
 95         int (*register_event)(struct mem_cgrou    
 96                               struct eventfd_c    
 97         /*                                        
 98          * unregister_event() callback will be    
 99          * the eventfd or on cgroup removing.     
100          * if you want provide notification fu    
101          */                                       
102         void (*unregister_event)(struct mem_cg    
103                                  struct eventf    
104         /*                                        
105          * All fields below needed to unregist    
106          * userspace closes eventfd.              
107          */                                       
108         poll_table pt;                            
109         wait_queue_head_t *wqh;                   
110         wait_queue_entry_t wait;                  
111         struct work_struct remove;                
112 };                                                
113                                                   
114 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (    
115 #define MEMFILE_TYPE(val)       ((val) >> 16 &    
116 #define MEMFILE_ATTR(val)       ((val) & 0xfff    
117                                                   
118 enum {                                            
119         RES_USAGE,                                
120         RES_LIMIT,                                
121         RES_MAX_USAGE,                            
122         RES_FAILCNT,                              
123         RES_SOFT_LIMIT,                           
124 };                                                
125                                                   
126 #ifdef CONFIG_LOCKDEP                             
127 static struct lockdep_map memcg_oom_lock_dep_m    
128         .name = "memcg_oom_lock",                 
129 };                                                
130 #endif                                            
131                                                   
132 DEFINE_SPINLOCK(memcg_oom_lock);                  
133                                                   
134 static void __mem_cgroup_insert_exceeded(struc    
135                                          struc    
136                                          unsig    
137 {                                                 
138         struct rb_node **p = &mctz->rb_root.rb    
139         struct rb_node *parent = NULL;            
140         struct mem_cgroup_per_node *mz_node;      
141         bool rightmost = true;                    
142                                                   
143         if (mz->on_tree)                          
144                 return;                           
145                                                   
146         mz->usage_in_excess = new_usage_in_exc    
147         if (!mz->usage_in_excess)                 
148                 return;                           
149         while (*p) {                              
150                 parent = *p;                      
151                 mz_node = rb_entry(parent, str    
152                                         tree_n    
153                 if (mz->usage_in_excess < mz_n    
154                         p = &(*p)->rb_left;       
155                         rightmost = false;        
156                 } else {                          
157                         p = &(*p)->rb_right;      
158                 }                                 
159         }                                         
160                                                   
161         if (rightmost)                            
162                 mctz->rb_rightmost = &mz->tree    
163                                                   
164         rb_link_node(&mz->tree_node, parent, p    
165         rb_insert_color(&mz->tree_node, &mctz-    
166         mz->on_tree = true;                       
167 }                                                 
168                                                   
169 static void __mem_cgroup_remove_exceeded(struc    
170                                          struc    
171 {                                                 
172         if (!mz->on_tree)                         
173                 return;                           
174                                                   
175         if (&mz->tree_node == mctz->rb_rightmo    
176                 mctz->rb_rightmost = rb_prev(&    
177                                                   
178         rb_erase(&mz->tree_node, &mctz->rb_roo    
179         mz->on_tree = false;                      
180 }                                                 
181                                                   
182 static void mem_cgroup_remove_exceeded(struct     
183                                        struct     
184 {                                                 
185         unsigned long flags;                      
186                                                   
187         spin_lock_irqsave(&mctz->lock, flags);    
188         __mem_cgroup_remove_exceeded(mz, mctz)    
189         spin_unlock_irqrestore(&mctz->lock, fl    
190 }                                                 
191                                                   
192 static unsigned long soft_limit_excess(struct     
193 {                                                 
194         unsigned long nr_pages = page_counter_    
195         unsigned long soft_limit = READ_ONCE(m    
196         unsigned long excess = 0;                 
197                                                   
198         if (nr_pages > soft_limit)                
199                 excess = nr_pages - soft_limit    
200                                                   
201         return excess;                            
202 }                                                 
203                                                   
204 static void memcg1_update_tree(struct mem_cgro    
205 {                                                 
206         unsigned long excess;                     
207         struct mem_cgroup_per_node *mz;           
208         struct mem_cgroup_tree_per_node *mctz;    
209                                                   
210         if (lru_gen_enabled()) {                  
211                 if (soft_limit_excess(memcg))     
212                         lru_gen_soft_reclaim(m    
213                 return;                           
214         }                                         
215                                                   
216         mctz = soft_limit_tree.rb_tree_per_nod    
217         if (!mctz)                                
218                 return;                           
219         /*                                        
220          * Necessary to update all ancestors w    
221          * because their event counter is not     
222          */                                       
223         for (; memcg; memcg = parent_mem_cgrou    
224                 mz = memcg->nodeinfo[nid];        
225                 excess = soft_limit_excess(mem    
226                 /*                                
227                  * We have to update the tree     
228                  * mem is over its softlimit.     
229                  */                               
230                 if (excess || mz->on_tree) {      
231                         unsigned long flags;      
232                                                   
233                         spin_lock_irqsave(&mct    
234                         /* if on-tree, remove     
235                         if (mz->on_tree)          
236                                 __mem_cgroup_r    
237                         /*                        
238                          * Insert again. mz->u    
239                          * If excess is 0, no     
240                          */                       
241                         __mem_cgroup_insert_ex    
242                         spin_unlock_irqrestore    
243                 }                                 
244         }                                         
245 }                                                 
246                                                   
247 void memcg1_remove_from_trees(struct mem_cgrou    
248 {                                                 
249         struct mem_cgroup_tree_per_node *mctz;    
250         struct mem_cgroup_per_node *mz;           
251         int nid;                                  
252                                                   
253         for_each_node(nid) {                      
254                 mz = memcg->nodeinfo[nid];        
255                 mctz = soft_limit_tree.rb_tree    
256                 if (mctz)                         
257                         mem_cgroup_remove_exce    
258         }                                         
259 }                                                 
260                                                   
261 static struct mem_cgroup_per_node *               
262 __mem_cgroup_largest_soft_limit_node(struct me    
263 {                                                 
264         struct mem_cgroup_per_node *mz;           
265                                                   
266 retry:                                            
267         mz = NULL;                                
268         if (!mctz->rb_rightmost)                  
269                 goto done;              /* Not    
270                                                   
271         mz = rb_entry(mctz->rb_rightmost,         
272                       struct mem_cgroup_per_no    
273         /*                                        
274          * Remove the node now but someone els    
275          * we will to add it back at the end o    
276          * position in the tree.                  
277          */                                       
278         __mem_cgroup_remove_exceeded(mz, mctz)    
279         if (!soft_limit_excess(mz->memcg) ||      
280             !css_tryget(&mz->memcg->css))         
281                 goto retry;                       
282 done:                                             
283         return mz;                                
284 }                                                 
285                                                   
286 static struct mem_cgroup_per_node *               
287 mem_cgroup_largest_soft_limit_node(struct mem_    
288 {                                                 
289         struct mem_cgroup_per_node *mz;           
290                                                   
291         spin_lock_irq(&mctz->lock);               
292         mz = __mem_cgroup_largest_soft_limit_n    
293         spin_unlock_irq(&mctz->lock);             
294         return mz;                                
295 }                                                 
296                                                   
297 static int mem_cgroup_soft_reclaim(struct mem_    
298                                    pg_data_t *    
299                                    gfp_t gfp_m    
300                                    unsigned lo    
301 {                                                 
302         struct mem_cgroup *victim = NULL;         
303         int total = 0;                            
304         int loop = 0;                             
305         unsigned long excess;                     
306         unsigned long nr_scanned;                 
307         struct mem_cgroup_reclaim_cookie recla    
308                 .pgdat = pgdat,                   
309         };                                        
310                                                   
311         excess = soft_limit_excess(root_memcg)    
312                                                   
313         while (1) {                               
314                 victim = mem_cgroup_iter(root_    
315                 if (!victim) {                    
316                         loop++;                   
317                         if (loop >= 2) {          
318                                 /*                
319                                  * If we have     
320                                  * anything, i    
321                                  * no reclaima    
322                                  */               
323                                 if (!total)       
324                                         break;    
325                                 /*                
326                                  * We want to     
327                                  * excess >> 2    
328                                  * reclaim too    
329                                  * coming back    
330                                  */               
331                                 if (total >= (    
332                                         (loop     
333                                         break;    
334                         }                         
335                         continue;                 
336                 }                                 
337                 total += mem_cgroup_shrink_nod    
338                                         pgdat,    
339                 *total_scanned += nr_scanned;     
340                 if (!soft_limit_excess(root_me    
341                         break;                    
342         }                                         
343         mem_cgroup_iter_break(root_memcg, vict    
344         return total;                             
345 }                                                 
346                                                   
347 unsigned long memcg1_soft_limit_reclaim(pg_dat    
348                                             gf    
349                                             un    
350 {                                                 
351         unsigned long nr_reclaimed = 0;           
352         struct mem_cgroup_per_node *mz, *next_    
353         unsigned long reclaimed;                  
354         int loop = 0;                             
355         struct mem_cgroup_tree_per_node *mctz;    
356         unsigned long excess;                     
357                                                   
358         if (lru_gen_enabled())                    
359                 return 0;                         
360                                                   
361         if (order > 0)                            
362                 return 0;                         
363                                                   
364         mctz = soft_limit_tree.rb_tree_per_nod    
365                                                   
366         /*                                        
367          * Do not even bother to check the lar    
368          * is empty. Do it lockless to prevent    
369          * are acceptable as soft limit is bes    
370          */                                       
371         if (!mctz || RB_EMPTY_ROOT(&mctz->rb_r    
372                 return 0;                         
373                                                   
374         /*                                        
375          * This loop can run a while, speciall    
376          * keep exceeding their soft limit and    
377          * pressure                               
378          */                                       
379         do {                                      
380                 if (next_mz)                      
381                         mz = next_mz;             
382                 else                              
383                         mz = mem_cgroup_larges    
384                 if (!mz)                          
385                         break;                    
386                                                   
387                 reclaimed = mem_cgroup_soft_re    
388                                                   
389                 nr_reclaimed += reclaimed;        
390                 spin_lock_irq(&mctz->lock);       
391                                                   
392                 /*                                
393                  * If we failed to reclaim any    
394                  * it is time to move on to th    
395                  */                               
396                 next_mz = NULL;                   
397                 if (!reclaimed)                   
398                         next_mz = __mem_cgroup    
399                                                   
400                 excess = soft_limit_excess(mz-    
401                 /*                                
402                  * One school of thought says     
403                  * back the node to the tree i    
404                  * But our reclaim could retur    
405                  * to priority we are exposing    
406                  * memory to reclaim from. Con    
407                  * term TODO.                     
408                  */                               
409                 /* If excess == 0, no tree ops    
410                 __mem_cgroup_insert_exceeded(m    
411                 spin_unlock_irq(&mctz->lock);     
412                 css_put(&mz->memcg->css);         
413                 loop++;                           
414                 /*                                
415                  * Could not reclaim anything     
416                  * mem cgroups to try or we se    
417                  * reclaiming anything.           
418                  */                               
419                 if (!nr_reclaimed &&              
420                         (next_mz == NULL ||       
421                         loop > MEM_CGROUP_MAX_    
422                         break;                    
423         } while (!nr_reclaimed);                  
424         if (next_mz)                              
425                 css_put(&next_mz->memcg->css);    
426         return nr_reclaimed;                      
427 }                                                 
428                                                   
429 /*                                                
430  * A routine for checking "mem" is under move_    
431  *                                                
432  * Checking a cgroup is mc.from or mc.to or un    
433  * moving cgroups. This is for waiting at high    
434  * caused by "move".                              
435  */                                               
436 static bool mem_cgroup_under_move(struct mem_c    
437 {                                                 
438         struct mem_cgroup *from;                  
439         struct mem_cgroup *to;                    
440         bool ret = false;                         
441         /*                                        
442          * Unlike task_move routines, we acces    
443          * mutual exclusion by cgroup_mutex. H    
444          */                                       
445         spin_lock(&mc.lock);                      
446         from = mc.from;                           
447         to = mc.to;                               
448         if (!from)                                
449                 goto unlock;                      
450                                                   
451         ret = mem_cgroup_is_descendant(from, m    
452                 mem_cgroup_is_descendant(to, m    
453 unlock:                                           
454         spin_unlock(&mc.lock);                    
455         return ret;                               
456 }                                                 
457                                                   
458 bool memcg1_wait_acct_move(struct mem_cgroup *    
459 {                                                 
460         if (mc.moving_task && current != mc.mo    
461                 if (mem_cgroup_under_move(memc    
462                         DEFINE_WAIT(wait);        
463                         prepare_to_wait(&mc.wa    
464                         /* moving charge conte    
465                         if (mc.moving_task)       
466                                 schedule();       
467                         finish_wait(&mc.waitq,    
468                         return true;              
469                 }                                 
470         }                                         
471         return false;                             
472 }                                                 
473                                                   
474 /**                                               
475  * folio_memcg_lock - Bind a folio to its memc    
476  * @folio: The folio.                             
477  *                                                
478  * This function prevents unlocked LRU folios     
479  * another cgroup.                                
480  *                                                
481  * It ensures lifetime of the bound memcg.  Th    
482  * for the lifetime of the folio.                 
483  */                                               
484 void folio_memcg_lock(struct folio *folio)        
485 {                                                 
486         struct mem_cgroup *memcg;                 
487         unsigned long flags;                      
488                                                   
489         /*                                        
490          * The RCU lock is held throughout the    
491          * path can get away without acquiring    
492          * because page moving starts with an     
493          */                                       
494         rcu_read_lock();                          
495                                                   
496         if (mem_cgroup_disabled())                
497                 return;                           
498 again:                                            
499         memcg = folio_memcg(folio);               
500         if (unlikely(!memcg))                     
501                 return;                           
502                                                   
503 #ifdef CONFIG_PROVE_LOCKING                       
504         local_irq_save(flags);                    
505         might_lock(&memcg->move_lock);            
506         local_irq_restore(flags);                 
507 #endif                                            
508                                                   
509         if (atomic_read(&memcg->moving_account    
510                 return;                           
511                                                   
512         spin_lock_irqsave(&memcg->move_lock, f    
513         if (memcg != folio_memcg(folio)) {        
514                 spin_unlock_irqrestore(&memcg-    
515                 goto again;                       
516         }                                         
517                                                   
518         /*                                        
519          * When charge migration first begins,    
520          * critical sections holding the fast-    
521          * holding the slowpath move_lock. Tra    
522          * move_lock for folio_memcg_unlock().    
523          */                                       
524         memcg->move_lock_task = current;          
525         memcg->move_lock_flags = flags;           
526 }                                                 
527                                                   
528 static void __folio_memcg_unlock(struct mem_cg    
529 {                                                 
530         if (memcg && memcg->move_lock_task ==     
531                 unsigned long flags = memcg->m    
532                                                   
533                 memcg->move_lock_task = NULL;     
534                 memcg->move_lock_flags = 0;       
535                                                   
536                 spin_unlock_irqrestore(&memcg-    
537         }                                         
538                                                   
539         rcu_read_unlock();                        
540 }                                                 
541                                                   
542 /**                                               
543  * folio_memcg_unlock - Release the binding be    
544  * @folio: The folio.                             
545  *                                                
546  * This releases the binding created by folio_    
547  * not change the accounting of this folio to     
548  * permit others to change it.                    
549  */                                               
550 void folio_memcg_unlock(struct folio *folio)      
551 {                                                 
552         __folio_memcg_unlock(folio_memcg(folio    
553 }                                                 
554                                                   
555 #ifdef CONFIG_SWAP                                
556 /**                                               
557  * mem_cgroup_move_swap_account - move swap ch    
558  * @entry: swap entry to be moved                 
559  * @from:  mem_cgroup which the entry is moved    
560  * @to:  mem_cgroup which the entry is moved t    
561  *                                                
562  * It succeeds only when the swap_cgroup's rec    
563  * as the mem_cgroup's id of @from.               
564  *                                                
565  * Returns 0 on success, -EINVAL on failure.      
566  *                                                
567  * The caller must have charged to @to, IOW, c    
568  * both res and memsw, and called css_get().      
569  */                                               
570 static int mem_cgroup_move_swap_account(swp_en    
571                                 struct mem_cgr    
572 {                                                 
573         unsigned short old_id, new_id;            
574                                                   
575         old_id = mem_cgroup_id(from);             
576         new_id = mem_cgroup_id(to);               
577                                                   
578         if (swap_cgroup_cmpxchg(entry, old_id,    
579                 mod_memcg_state(from, MEMCG_SW    
580                 mod_memcg_state(to, MEMCG_SWAP    
581                 return 0;                         
582         }                                         
583         return -EINVAL;                           
584 }                                                 
585 #else                                             
586 static inline int mem_cgroup_move_swap_account    
587                                 struct mem_cgr    
588 {                                                 
589         return -EINVAL;                           
590 }                                                 
591 #endif                                            
592                                                   
593 static u64 mem_cgroup_move_charge_read(struct     
594                                 struct cftype     
595 {                                                 
596         return mem_cgroup_from_css(css)->move_    
597 }                                                 
598                                                   
599 #ifdef CONFIG_MMU                                 
600 static int mem_cgroup_move_charge_write(struct    
601                                  struct cftype    
602 {                                                 
603         struct mem_cgroup *memcg = mem_cgroup_    
604                                                   
605         pr_warn_once("Cgroup memory moving (mo    
606                      "Please report your useca    
607                      "depend on this functiona    
608                                                   
609         if (val & ~MOVE_MASK)                     
610                 return -EINVAL;                   
611                                                   
612         /*                                        
613          * No kind of locking is needed in her    
614          * check this value once in the beginn    
615          * on with stale data. This means that    
616          * affect task migrations starting aft    
617          */                                       
618         memcg->move_charge_at_immigrate = val;    
619         return 0;                                 
620 }                                                 
621 #else                                             
622 static int mem_cgroup_move_charge_write(struct    
623                                  struct cftype    
624 {                                                 
625         return -ENOSYS;                           
626 }                                                 
627 #endif                                            
628                                                   
629 #ifdef CONFIG_MMU                                 
630 /* Handlers for move charge at task migration.    
631 static int mem_cgroup_do_precharge(unsigned lo    
632 {                                                 
633         int ret;                                  
634                                                   
635         /* Try a single bulk charge without re    
636         ret = try_charge(mc.to, GFP_KERNEL & ~    
637         if (!ret) {                               
638                 mc.precharge += count;            
639                 return ret;                       
640         }                                         
641                                                   
642         /* Try charges one by one with reclaim    
643         while (count--) {                         
644                 ret = try_charge(mc.to, GFP_KE    
645                 if (ret)                          
646                         return ret;               
647                 mc.precharge++;                   
648                 cond_resched();                   
649         }                                         
650         return 0;                                 
651 }                                                 
652                                                   
653 union mc_target {                                 
654         struct folio    *folio;                   
655         swp_entry_t     ent;                      
656 };                                                
657                                                   
658 enum mc_target_type {                             
659         MC_TARGET_NONE = 0,                       
660         MC_TARGET_PAGE,                           
661         MC_TARGET_SWAP,                           
662         MC_TARGET_DEVICE,                         
663 };                                                
664                                                   
665 static struct page *mc_handle_present_pte(stru    
666                                                   
667 {                                                 
668         struct page *page = vm_normal_page(vma    
669                                                   
670         if (!page)                                
671                 return NULL;                      
672         if (PageAnon(page)) {                     
673                 if (!(mc.flags & MOVE_ANON))      
674                         return NULL;              
675         } else {                                  
676                 if (!(mc.flags & MOVE_FILE))      
677                         return NULL;              
678         }                                         
679         get_page(page);                           
680                                                   
681         return page;                              
682 }                                                 
683                                                   
684 #if defined(CONFIG_SWAP) || defined(CONFIG_DEV    
685 static struct page *mc_handle_swap_pte(struct     
686                         pte_t ptent, swp_entry    
687 {                                                 
688         struct page *page = NULL;                 
689         swp_entry_t ent = pte_to_swp_entry(pte    
690                                                   
691         if (!(mc.flags & MOVE_ANON))              
692                 return NULL;                      
693                                                   
694         /*                                        
695          * Handle device private pages that ar    
696          * stored as special swap entries in t    
697          */                                       
698         if (is_device_private_entry(ent)) {       
699                 page = pfn_swap_entry_to_page(    
700                 if (!get_page_unless_zero(page    
701                         return NULL;              
702                 return page;                      
703         }                                         
704                                                   
705         if (non_swap_entry(ent))                  
706                 return NULL;                      
707                                                   
708         /*                                        
709          * Because swap_cache_get_folio() upda    
710          * we call find_get_page() with swappe    
711          */                                       
712         page = find_get_page(swap_address_spac    
713         entry->val = ent.val;                     
714                                                   
715         return page;                              
716 }                                                 
717 #else                                             
718 static struct page *mc_handle_swap_pte(struct     
719                         pte_t ptent, swp_entry    
720 {                                                 
721         return NULL;                              
722 }                                                 
723 #endif                                            
724                                                   
725 static struct page *mc_handle_file_pte(struct     
726                         unsigned long addr, pt    
727 {                                                 
728         unsigned long index;                      
729         struct folio *folio;                      
730                                                   
731         if (!vma->vm_file) /* anonymous vma */    
732                 return NULL;                      
733         if (!(mc.flags & MOVE_FILE))              
734                 return NULL;                      
735                                                   
736         /* folio is moved even if it's not RSS    
737         /* shmem/tmpfs may report page out on     
738         index = linear_page_index(vma, addr);     
739         folio = filemap_get_incore_folio(vma->    
740         if (IS_ERR(folio))                        
741                 return NULL;                      
742         return folio_file_page(folio, index);     
743 }                                                 
744                                                   
745 static void memcg1_check_events(struct mem_cgr    
746 static void memcg1_charge_statistics(struct me    
747                                                   
748 /**                                               
749  * mem_cgroup_move_account - move account of t    
750  * @folio: The folio.                             
751  * @compound: charge the page as compound or s    
752  * @from: mem_cgroup which the folio is moved     
753  * @to: mem_cgroup which the folio is moved to    
754  *                                                
755  * The folio must be locked and not on the LRU    
756  *                                                
757  * This function doesn't do "charge" to new cg    
758  * from old cgroup.                               
759  */                                               
760 static int mem_cgroup_move_account(struct foli    
761                                    bool compou    
762                                    struct mem_    
763                                    struct mem_    
764 {                                                 
765         struct lruvec *from_vec, *to_vec;         
766         struct pglist_data *pgdat;                
767         unsigned int nr_pages = compound ? fol    
768         int nid, ret;                             
769                                                   
770         VM_BUG_ON(from == to);                    
771         VM_BUG_ON_FOLIO(!folio_test_locked(fol    
772         VM_BUG_ON_FOLIO(folio_test_lru(folio),    
773         VM_BUG_ON(compound && !folio_test_larg    
774                                                   
775         ret = -EINVAL;                            
776         if (folio_memcg(folio) != from)           
777                 goto out;                         
778                                                   
779         pgdat = folio_pgdat(folio);               
780         from_vec = mem_cgroup_lruvec(from, pgd    
781         to_vec = mem_cgroup_lruvec(to, pgdat);    
782                                                   
783         folio_memcg_lock(folio);                  
784                                                   
785         if (folio_test_anon(folio)) {             
786                 if (folio_mapped(folio)) {        
787                         __mod_lruvec_state(fro    
788                         __mod_lruvec_state(to_    
789                         if (folio_test_pmd_map    
790                                 __mod_lruvec_s    
791                                                   
792                                 __mod_lruvec_s    
793                                                   
794                         }                         
795                 }                                 
796         } else {                                  
797                 __mod_lruvec_state(from_vec, N    
798                 __mod_lruvec_state(to_vec, NR_    
799                                                   
800                 if (folio_test_swapbacked(foli    
801                         __mod_lruvec_state(fro    
802                         __mod_lruvec_state(to_    
803                 }                                 
804                                                   
805                 if (folio_mapped(folio)) {        
806                         __mod_lruvec_state(fro    
807                         __mod_lruvec_state(to_    
808                 }                                 
809                                                   
810                 if (folio_test_dirty(folio)) {    
811                         struct address_space *    
812                                                   
813                         if (mapping_can_writeb    
814                                 __mod_lruvec_s    
815                                                   
816                                 __mod_lruvec_s    
817                                                   
818                         }                         
819                 }                                 
820         }                                         
821                                                   
822 #ifdef CONFIG_SWAP                                
823         if (folio_test_swapcache(folio)) {        
824                 __mod_lruvec_state(from_vec, N    
825                 __mod_lruvec_state(to_vec, NR_    
826         }                                         
827 #endif                                            
828         if (folio_test_writeback(folio)) {        
829                 __mod_lruvec_state(from_vec, N    
830                 __mod_lruvec_state(to_vec, NR_    
831         }                                         
832                                                   
833         /*                                        
834          * All state has been migrated, let's     
835          *                                        
836          * It is safe to change page's memcg h    
837          * is referenced, charged, isolated, a    
838          * with (un)charging, migration, LRU p    
839          * that would rely on a stable page's     
840          *                                        
841          * Note that folio_memcg_lock is a mem    
842          * to save space. As soon as we switch    
843          * new memcg that isn't locked, the ab    
844          * concurrently again. Make sure we're    
845          */                                       
846         smp_mb();                                 
847                                                   
848         css_get(&to->css);                        
849         css_put(&from->css);                      
850                                                   
851         /* Warning should never happen, so don    
852         WARN_ON_ONCE(folio_unqueue_deferred_sp    
853         folio->memcg_data = (unsigned long)to;    
854                                                   
855         __folio_memcg_unlock(from);               
856                                                   
857         ret = 0;                                  
858         nid = folio_nid(folio);                   
859                                                   
860         local_irq_disable();                      
861         memcg1_charge_statistics(to, nr_pages)    
862         memcg1_check_events(to, nid);             
863         memcg1_charge_statistics(from, -nr_pag    
864         memcg1_check_events(from, nid);           
865         local_irq_enable();                       
866 out:                                              
867         return ret;                               
868 }                                                 
869                                                   
870 /**                                               
871  * get_mctgt_type - get target type of moving     
872  * @vma: the vma the pte to be checked belongs    
873  * @addr: the address corresponding to the pte    
874  * @ptent: the pte to be checked                  
875  * @target: the pointer the target page or swa    
876  *                                                
877  * Context: Called with pte lock held.            
878  * Return:                                        
879  * * MC_TARGET_NONE - If the pte is not a targ    
880  * * MC_TARGET_PAGE - If the page correspondin    
881  *   move charge. If @target is not NULL, the     
882  *   with extra refcnt taken (Caller should re    
883  * * MC_TARGET_SWAP - If the swap entry corres    
884  *   target for charge migration.  If @target     
885  *   stored in target->ent.                       
886  * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE bu    
887  *   thus not on the lru.  For now such page i    
888  *   would be as it is just special memory tak    
889  *   See Documentations/vm/hmm.txt and include    
890  */                                               
891 static enum mc_target_type get_mctgt_type(stru    
892                 unsigned long addr, pte_t pten    
893 {                                                 
894         struct page *page = NULL;                 
895         struct folio *folio;                      
896         enum mc_target_type ret = MC_TARGET_NO    
897         swp_entry_t ent = { .val = 0 };           
898                                                   
899         if (pte_present(ptent))                   
900                 page = mc_handle_present_pte(v    
901         else if (pte_none_mostly(ptent))          
902                 /*                                
903                  * PTE markers should be treat    
904                  * from other swap handling be    
905                  */                               
906                 page = mc_handle_file_pte(vma,    
907         else if (is_swap_pte(ptent))              
908                 page = mc_handle_swap_pte(vma,    
909                                                   
910         if (page)                                 
911                 folio = page_folio(page);         
912         if (target && page) {                     
913                 if (!folio_trylock(folio)) {      
914                         folio_put(folio);         
915                         return ret;               
916                 }                                 
917                 /*                                
918                  * page_mapped() must be stabl    
919                  * pte is locked, so if it's p    
920                  * become unmapped. If it isn'    
921                  * control over the mapped sta    
922                  * prevent new faults against     
923                  * so an unmapped page cannot     
924                  * if the page is already mapp    
925                  * unmap, and there is nothing    
926                  * Alas, skip moving the page     
927                  */                               
928                 if (!pte_present(ptent) && pag    
929                         folio_unlock(folio);      
930                         folio_put(folio);         
931                         return ret;               
932                 }                                 
933         }                                         
934                                                   
935         if (!page && !ent.val)                    
936                 return ret;                       
937         if (page) {                               
938                 /*                                
939                  * Do only loose check w/o ser    
940                  * mem_cgroup_move_account() c    
941                  * not under LRU exclusion.       
942                  */                               
943                 if (folio_memcg(folio) == mc.f    
944                         ret = MC_TARGET_PAGE;     
945                         if (folio_is_device_pr    
946                             folio_is_device_co    
947                                 ret = MC_TARGE    
948                         if (target)               
949                                 target->folio     
950                 }                                 
951                 if (!ret || !target) {            
952                         if (target)               
953                                 folio_unlock(f    
954                         folio_put(folio);         
955                 }                                 
956         }                                         
957         /*                                        
958          * There is a swap entry and a page do    
959          * But we cannot move a tail-page in a    
960          */                                       
961         if (ent.val && !ret && (!page || !Page    
962             mem_cgroup_id(mc.from) == lookup_s    
963                 ret = MC_TARGET_SWAP;             
964                 if (target)                       
965                         target->ent = ent;        
966         }                                         
967         return ret;                               
968 }                                                 
969                                                   
970 #ifdef CONFIG_TRANSPARENT_HUGEPAGE                
971 /*                                                
972  * We don't consider PMD mapped swapping or fi    
973  * not support them for now.                      
974  * Caller should make sure that pmd_trans_huge    
975  */                                               
976 static enum mc_target_type get_mctgt_type_thp(    
977                 unsigned long addr, pmd_t pmd,    
978 {                                                 
979         struct page *page = NULL;                 
980         struct folio *folio;                      
981         enum mc_target_type ret = MC_TARGET_NO    
982                                                   
983         if (unlikely(is_swap_pmd(pmd))) {         
984                 VM_BUG_ON(thp_migration_suppor    
985                                   !is_pmd_migr    
986                 return ret;                       
987         }                                         
988         page = pmd_page(pmd);                     
989         VM_BUG_ON_PAGE(!page || !PageHead(page    
990         folio = page_folio(page);                 
991         if (!(mc.flags & MOVE_ANON))              
992                 return ret;                       
993         if (folio_memcg(folio) == mc.from) {      
994                 ret = MC_TARGET_PAGE;             
995                 if (target) {                     
996                         folio_get(folio);         
997                         if (!folio_trylock(fol    
998                                 folio_put(foli    
999                                 return MC_TARG    
1000                         }                        
1001                         target->folio = folio    
1002                 }                                
1003         }                                        
1004         return ret;                              
1005 }                                                
1006 #else                                            
1007 static inline enum mc_target_type get_mctgt_t    
1008                 unsigned long addr, pmd_t pmd    
1009 {                                                
1010         return MC_TARGET_NONE;                   
1011 }                                                
1012 #endif                                           
1013                                                  
1014 static int mem_cgroup_count_precharge_pte_ran    
1015                                         unsig    
1016                                         struc    
1017 {                                                
1018         struct vm_area_struct *vma = walk->vm    
1019         pte_t *pte;                              
1020         spinlock_t *ptl;                         
1021                                                  
1022         ptl = pmd_trans_huge_lock(pmd, vma);     
1023         if (ptl) {                               
1024                 /*                               
1025                  * Note their can not be MC_T    
1026                  * support transparent huge p    
1027                  * this might change.            
1028                  */                              
1029                 if (get_mctgt_type_thp(vma, a    
1030                         mc.precharge += HPAGE    
1031                 spin_unlock(ptl);                
1032                 return 0;                        
1033         }                                        
1034                                                  
1035         pte = pte_offset_map_lock(vma->vm_mm,    
1036         if (!pte)                                
1037                 return 0;                        
1038         for (; addr != end; pte++, addr += PA    
1039                 if (get_mctgt_type(vma, addr,    
1040                         mc.precharge++; /* in    
1041         pte_unmap_unlock(pte - 1, ptl);          
1042         cond_resched();                          
1043                                                  
1044         return 0;                                
1045 }                                                
1046                                                  
1047 static const struct mm_walk_ops precharge_wal    
1048         .pmd_entry      = mem_cgroup_count_pr    
1049         .walk_lock      = PGWALK_RDLOCK,         
1050 };                                               
1051                                                  
1052 static unsigned long mem_cgroup_count_prechar    
1053 {                                                
1054         unsigned long precharge;                 
1055                                                  
1056         mmap_read_lock(mm);                      
1057         walk_page_range(mm, 0, ULONG_MAX, &pr    
1058         mmap_read_unlock(mm);                    
1059                                                  
1060         precharge = mc.precharge;                
1061         mc.precharge = 0;                        
1062                                                  
1063         return precharge;                        
1064 }                                                
1065                                                  
1066 static int mem_cgroup_precharge_mc(struct mm_    
1067 {                                                
1068         unsigned long precharge = mem_cgroup_    
1069                                                  
1070         VM_BUG_ON(mc.moving_task);               
1071         mc.moving_task = current;                
1072         return mem_cgroup_do_precharge(precha    
1073 }                                                
1074                                                  
1075 /* cancels all extra charges on mc.from and m    
1076 static void __mem_cgroup_clear_mc(void)          
1077 {                                                
1078         struct mem_cgroup *from = mc.from;       
1079         struct mem_cgroup *to = mc.to;           
1080                                                  
1081         /* we must uncharge all the leftover     
1082         if (mc.precharge) {                      
1083                 mem_cgroup_cancel_charge(mc.t    
1084                 mc.precharge = 0;                
1085         }                                        
1086         /*                                       
1087          * we didn't uncharge from mc.from at    
1088          * we must uncharge here.                
1089          */                                      
1090         if (mc.moved_charge) {                   
1091                 mem_cgroup_cancel_charge(mc.f    
1092                 mc.moved_charge = 0;             
1093         }                                        
1094         /* we must fixup refcnts and charges     
1095         if (mc.moved_swap) {                     
1096                 /* uncharge swap account from    
1097                 if (!mem_cgroup_is_root(mc.fr    
1098                         page_counter_uncharge    
1099                                                  
1100                 mem_cgroup_id_put_many(mc.fro    
1101                                                  
1102                 /*                               
1103                  * we charged both to->memory    
1104                  * should uncharge to->memory    
1105                  */                              
1106                 if (!mem_cgroup_is_root(mc.to    
1107                         page_counter_uncharge    
1108                                                  
1109                 mc.moved_swap = 0;               
1110         }                                        
1111         memcg1_oom_recover(from);                
1112         memcg1_oom_recover(to);                  
1113         wake_up_all(&mc.waitq);                  
1114 }                                                
1115                                                  
1116 static void mem_cgroup_clear_mc(void)            
1117 {                                                
1118         struct mm_struct *mm = mc.mm;            
1119                                                  
1120         /*                                       
1121          * we must clear moving_task before w    
1122          * task migration.                       
1123          */                                      
1124         mc.moving_task = NULL;                   
1125         __mem_cgroup_clear_mc();                 
1126         spin_lock(&mc.lock);                     
1127         mc.from = NULL;                          
1128         mc.to = NULL;                            
1129         mc.mm = NULL;                            
1130         spin_unlock(&mc.lock);                   
1131                                                  
1132         mmput(mm);                               
1133 }                                                
1134                                                  
1135 int memcg1_can_attach(struct cgroup_taskset *    
1136 {                                                
1137         struct cgroup_subsys_state *css;         
1138         struct mem_cgroup *memcg = NULL; /* u    
1139         struct mem_cgroup *from;                 
1140         struct task_struct *leader, *p;          
1141         struct mm_struct *mm;                    
1142         unsigned long move_flags;                
1143         int ret = 0;                             
1144                                                  
1145         /* charge immigration isn't supported    
1146         if (cgroup_subsys_on_dfl(memory_cgrp_    
1147                 return 0;                        
1148                                                  
1149         /*                                       
1150          * Multi-process migrations only happ    
1151          * where charge immigration is not us    
1152          * immigration if @tset contains a le    
1153          * multiple.                             
1154          */                                      
1155         p = NULL;                                
1156         cgroup_taskset_for_each_leader(leader    
1157                 WARN_ON_ONCE(p);                 
1158                 p = leader;                      
1159                 memcg = mem_cgroup_from_css(c    
1160         }                                        
1161         if (!p)                                  
1162                 return 0;                        
1163                                                  
1164         /*                                       
1165          * We are now committed to this value    
1166          * tunable will only affect upcoming     
1167          * So we need to save it, and keep it    
1168          */                                      
1169         move_flags = READ_ONCE(memcg->move_ch    
1170         if (!move_flags)                         
1171                 return 0;                        
1172                                                  
1173         from = mem_cgroup_from_task(p);          
1174                                                  
1175         VM_BUG_ON(from == memcg);                
1176                                                  
1177         mm = get_task_mm(p);                     
1178         if (!mm)                                 
1179                 return 0;                        
1180         /* We move charges only when we move     
1181         if (mm->owner == p) {                    
1182                 VM_BUG_ON(mc.from);              
1183                 VM_BUG_ON(mc.to);                
1184                 VM_BUG_ON(mc.precharge);         
1185                 VM_BUG_ON(mc.moved_charge);      
1186                 VM_BUG_ON(mc.moved_swap);        
1187                                                  
1188                 spin_lock(&mc.lock);             
1189                 mc.mm = mm;                      
1190                 mc.from = from;                  
1191                 mc.to = memcg;                   
1192                 mc.flags = move_flags;           
1193                 spin_unlock(&mc.lock);           
1194                 /* We set mc.moving_task late    
1195                                                  
1196                 ret = mem_cgroup_precharge_mc    
1197                 if (ret)                         
1198                         mem_cgroup_clear_mc()    
1199         } else {                                 
1200                 mmput(mm);                       
1201         }                                        
1202         return ret;                              
1203 }                                                
1204                                                  
1205 void memcg1_cancel_attach(struct cgroup_tasks    
1206 {                                                
1207         if (mc.to)                               
1208                 mem_cgroup_clear_mc();           
1209 }                                                
1210                                                  
1211 static int mem_cgroup_move_charge_pte_range(p    
1212                                 unsigned long    
1213                                 struct mm_wal    
1214 {                                                
1215         int ret = 0;                             
1216         struct vm_area_struct *vma = walk->vm    
1217         pte_t *pte;                              
1218         spinlock_t *ptl;                         
1219         enum mc_target_type target_type;         
1220         union mc_target target;                  
1221         struct folio *folio;                     
1222         bool tried_split_before = false;         
1223                                                  
1224 retry_pmd:                                       
1225         ptl = pmd_trans_huge_lock(pmd, vma);     
1226         if (ptl) {                               
1227                 if (mc.precharge < HPAGE_PMD_    
1228                         spin_unlock(ptl);        
1229                         return 0;                
1230                 }                                
1231                 target_type = get_mctgt_type_    
1232                 if (target_type == MC_TARGET_    
1233                         folio = target.folio;    
1234                         /*                       
1235                          * Deferred split que    
1236                          * and unqueue is uns    
1237                          * split or skip if o    
1238                          */                      
1239                         if (!list_empty(&foli    
1240                                 spin_unlock(p    
1241                                 if (!tried_sp    
1242                                         split    
1243                                 folio_unlock(    
1244                                 folio_put(fol    
1245                                 if (tried_spl    
1246                                         retur    
1247                                 tried_split_b    
1248                                 goto retry_pm    
1249                         }                        
1250                         /*                       
1251                          * So long as that pm    
1252                          * be racily added to    
1253                          * __folio_remove_rma    
1254                          */                      
1255                         if (folio_isolate_lru    
1256                                 if (!mem_cgro    
1257                                                  
1258                                         mc.pr    
1259                                         mc.mo    
1260                                 }                
1261                                 folio_putback    
1262                         }                        
1263                         folio_unlock(folio);     
1264                         folio_put(folio);        
1265                 } else if (target_type == MC_    
1266                         folio = target.folio;    
1267                         if (!mem_cgroup_move_    
1268                                                  
1269                                 mc.precharge     
1270                                 mc.moved_char    
1271                         }                        
1272                         folio_unlock(folio);     
1273                         folio_put(folio);        
1274                 }                                
1275                 spin_unlock(ptl);                
1276                 return 0;                        
1277         }                                        
1278                                                  
1279 retry:                                           
1280         pte = pte_offset_map_lock(vma->vm_mm,    
1281         if (!pte)                                
1282                 return 0;                        
1283         for (; addr != end; addr += PAGE_SIZE    
1284                 pte_t ptent = ptep_get(pte++)    
1285                 bool device = false;             
1286                 swp_entry_t ent;                 
1287                                                  
1288                 if (!mc.precharge)               
1289                         break;                   
1290                                                  
1291                 switch (get_mctgt_type(vma, a    
1292                 case MC_TARGET_DEVICE:           
1293                         device = true;           
1294                         fallthrough;             
1295                 case MC_TARGET_PAGE:             
1296                         folio = target.folio;    
1297                         /*                       
1298                          * We can have a part    
1299                          * can be done but it    
1300                          * ignore such a part    
1301                          * memcg. There shoul    
1302                          */                      
1303                         if (folio_test_large(    
1304                                 goto put;        
1305                         if (!device && !folio    
1306                                 goto put;        
1307                         if (!mem_cgroup_move_    
1308                                                  
1309                                 mc.precharge-    
1310                                 /* we uncharg    
1311                                 mc.moved_char    
1312                         }                        
1313                         if (!device)             
1314                                 folio_putback    
1315 put:                    /* get_mctgt_type() g    
1316                         folio_unlock(folio);     
1317                         folio_put(folio);        
1318                         break;                   
1319                 case MC_TARGET_SWAP:             
1320                         ent = target.ent;        
1321                         if (!mem_cgroup_move_    
1322                                 mc.precharge-    
1323                                 mem_cgroup_id    
1324                                 /* we fixup o    
1325                                 mc.moved_swap    
1326                         }                        
1327                         break;                   
1328                 default:                         
1329                         break;                   
1330                 }                                
1331         }                                        
1332         pte_unmap_unlock(pte - 1, ptl);          
1333         cond_resched();                          
1334                                                  
1335         if (addr != end) {                       
1336                 /*                               
1337                  * We have consumed all prech    
1338                  * We try charge one by one,     
1339                  * charges to mc.to if we hav    
1340                  * phase.                        
1341                  */                              
1342                 ret = mem_cgroup_do_precharge    
1343                 if (!ret)                        
1344                         goto retry;              
1345         }                                        
1346                                                  
1347         return ret;                              
1348 }                                                
1349                                                  
1350 static const struct mm_walk_ops charge_walk_o    
1351         .pmd_entry      = mem_cgroup_move_cha    
1352         .walk_lock      = PGWALK_RDLOCK,         
1353 };                                               
1354                                                  
1355 static void mem_cgroup_move_charge(void)         
1356 {                                                
1357         lru_add_drain_all();                     
1358         /*                                       
1359          * Signal folio_memcg_lock() to take     
1360          * while we're moving its pages to an    
1361          * for already started RCU-only updat    
1362          */                                      
1363         atomic_inc(&mc.from->moving_account);    
1364         synchronize_rcu();                       
1365 retry:                                           
1366         if (unlikely(!mmap_read_trylock(mc.mm    
1367                 /*                               
1368                  * Someone who are holding th    
1369                  * waitq. So we cancel all ex    
1370                  * and retry. Because we canc    
1371                  * to move enough charges, bu    
1372                  * feature anyway, so it woul    
1373                  */                              
1374                 __mem_cgroup_clear_mc();         
1375                 cond_resched();                  
1376                 goto retry;                      
1377         }                                        
1378         /*                                       
1379          * When we have consumed all precharg    
1380          * additional charge, the page walk j    
1381          */                                      
1382         walk_page_range(mc.mm, 0, ULONG_MAX,     
1383         mmap_read_unlock(mc.mm);                 
1384         atomic_dec(&mc.from->moving_account);    
1385 }                                                
1386                                                  
1387 void memcg1_move_task(void)                      
1388 {                                                
1389         if (mc.to) {                             
1390                 mem_cgroup_move_charge();        
1391                 mem_cgroup_clear_mc();           
1392         }                                        
1393 }                                                
1394                                                  
1395 #else   /* !CONFIG_MMU */                        
1396 int memcg1_can_attach(struct cgroup_taskset *    
1397 {                                                
1398         return 0;                                
1399 }                                                
1400 void memcg1_cancel_attach(struct cgroup_tasks    
1401 {                                                
1402 }                                                
1403 void memcg1_move_task(void)                      
1404 {                                                
1405 }                                                
1406 #endif                                           
1407                                                  
1408 static void __mem_cgroup_threshold(struct mem    
1409 {                                                
1410         struct mem_cgroup_threshold_ary *t;      
1411         unsigned long usage;                     
1412         int i;                                   
1413                                                  
1414         rcu_read_lock();                         
1415         if (!swap)                               
1416                 t = rcu_dereference(memcg->th    
1417         else                                     
1418                 t = rcu_dereference(memcg->me    
1419                                                  
1420         if (!t)                                  
1421                 goto unlock;                     
1422                                                  
1423         usage = mem_cgroup_usage(memcg, swap)    
1424                                                  
1425         /*                                       
1426          * current_threshold points to thresh    
1427          * If it's not true, a threshold was     
1428          * call of __mem_cgroup_threshold().     
1429          */                                      
1430         i = t->current_threshold;                
1431                                                  
1432         /*                                       
1433          * Iterate backward over array of thr    
1434          * current_threshold and check if a t    
1435          * If none of thresholds below usage     
1436          * only one element of the array here    
1437          */                                      
1438         for (; i >= 0 && unlikely(t->entries[    
1439                 eventfd_signal(t->entries[i].    
1440                                                  
1441         /* i = current_threshold + 1 */          
1442         i++;                                     
1443                                                  
1444         /*                                       
1445          * Iterate forward over array of thre    
1446          * current_threshold+1 and check if a    
1447          * If none of thresholds above usage     
1448          * only one element of the array here    
1449          */                                      
1450         for (; i < t->size && unlikely(t->ent    
1451                 eventfd_signal(t->entries[i].    
1452                                                  
1453         /* Update current_threshold */           
1454         t->current_threshold = i - 1;            
1455 unlock:                                          
1456         rcu_read_unlock();                       
1457 }                                                
1458                                                  
1459 static void mem_cgroup_threshold(struct mem_c    
1460 {                                                
1461         while (memcg) {                          
1462                 __mem_cgroup_threshold(memcg,    
1463                 if (do_memsw_account())          
1464                         __mem_cgroup_threshol    
1465                                                  
1466                 memcg = parent_mem_cgroup(mem    
1467         }                                        
1468 }                                                
1469                                                  
1470 /* Cgroup1: threshold notifications & softlim    
1471 struct memcg1_events_percpu {                    
1472         unsigned long nr_page_events;            
1473         unsigned long targets[MEM_CGROUP_NTAR    
1474 };                                               
1475                                                  
1476 static void memcg1_charge_statistics(struct m    
1477 {                                                
1478         /* pagein of a big page is an event.     
1479         if (nr_pages > 0)                        
1480                 __count_memcg_events(memcg, P    
1481         else {                                   
1482                 __count_memcg_events(memcg, P    
1483                 nr_pages = -nr_pages; /* for     
1484         }                                        
1485                                                  
1486         __this_cpu_add(memcg->events_percpu->    
1487 }                                                
1488                                                  
1489 #define THRESHOLDS_EVENTS_TARGET 128             
1490 #define SOFTLIMIT_EVENTS_TARGET 1024             
1491                                                  
1492 static bool memcg1_event_ratelimit(struct mem    
1493                                 enum mem_cgro    
1494 {                                                
1495         unsigned long val, next;                 
1496                                                  
1497         val = __this_cpu_read(memcg->events_p    
1498         next = __this_cpu_read(memcg->events_    
1499         /* from time_after() in jiffies.h */     
1500         if ((long)(next - val) < 0) {            
1501                 switch (target) {                
1502                 case MEM_CGROUP_TARGET_THRESH    
1503                         next = val + THRESHOL    
1504                         break;                   
1505                 case MEM_CGROUP_TARGET_SOFTLI    
1506                         next = val + SOFTLIMI    
1507                         break;                   
1508                 default:                         
1509                         break;                   
1510                 }                                
1511                 __this_cpu_write(memcg->event    
1512                 return true;                     
1513         }                                        
1514         return false;                            
1515 }                                                
1516                                                  
1517 /*                                               
1518  * Check events in order.                        
1519  *                                               
1520  */                                              
1521 static void memcg1_check_events(struct mem_cg    
1522 {                                                
1523         if (IS_ENABLED(CONFIG_PREEMPT_RT))       
1524                 return;                          
1525                                                  
1526         /* threshold event is triggered in fi    
1527         if (unlikely(memcg1_event_ratelimit(m    
1528                                                  
1529                 bool do_softlimit;               
1530                                                  
1531                 do_softlimit = memcg1_event_r    
1532                                                  
1533                 mem_cgroup_threshold(memcg);     
1534                 if (unlikely(do_softlimit))      
1535                         memcg1_update_tree(me    
1536         }                                        
1537 }                                                
1538                                                  
1539 void memcg1_commit_charge(struct folio *folio    
1540 {                                                
1541         unsigned long flags;                     
1542                                                  
1543         local_irq_save(flags);                   
1544         memcg1_charge_statistics(memcg, folio    
1545         memcg1_check_events(memcg, folio_nid(    
1546         local_irq_restore(flags);                
1547 }                                                
1548                                                  
1549 void memcg1_swapout(struct folio *folio, stru    
1550 {                                                
1551         /*                                       
1552          * Interrupts should be disabled here    
1553          * i_pages lock which is taken with i    
1554          * important here to have the interru    
1555          * only synchronisation we have for u    
1556          */                                      
1557         preempt_disable_nested();                
1558         VM_WARN_ON_IRQS_ENABLED();               
1559         memcg1_charge_statistics(memcg, -foli    
1560         preempt_enable_nested();                 
1561         memcg1_check_events(memcg, folio_nid(    
1562 }                                                
1563                                                  
1564 void memcg1_uncharge_batch(struct mem_cgroup     
1565                            unsigned long nr_m    
1566 {                                                
1567         unsigned long flags;                     
1568                                                  
1569         local_irq_save(flags);                   
1570         __count_memcg_events(memcg, PGPGOUT,     
1571         __this_cpu_add(memcg->events_percpu->    
1572         memcg1_check_events(memcg, nid);         
1573         local_irq_restore(flags);                
1574 }                                                
1575                                                  
1576 static int compare_thresholds(const void *a,     
1577 {                                                
1578         const struct mem_cgroup_threshold *_a    
1579         const struct mem_cgroup_threshold *_b    
1580                                                  
1581         if (_a->threshold > _b->threshold)       
1582                 return 1;                        
1583                                                  
1584         if (_a->threshold < _b->threshold)       
1585                 return -1;                       
1586                                                  
1587         return 0;                                
1588 }                                                
1589                                                  
1590 static int mem_cgroup_oom_notify_cb(struct me    
1591 {                                                
1592         struct mem_cgroup_eventfd_list *ev;      
1593                                                  
1594         spin_lock(&memcg_oom_lock);              
1595                                                  
1596         list_for_each_entry(ev, &memcg->oom_n    
1597                 eventfd_signal(ev->eventfd);     
1598                                                  
1599         spin_unlock(&memcg_oom_lock);            
1600         return 0;                                
1601 }                                                
1602                                                  
1603 static void mem_cgroup_oom_notify(struct mem_    
1604 {                                                
1605         struct mem_cgroup *iter;                 
1606                                                  
1607         for_each_mem_cgroup_tree(iter, memcg)    
1608                 mem_cgroup_oom_notify_cb(iter    
1609 }                                                
1610                                                  
1611 static int __mem_cgroup_usage_register_event(    
1612         struct eventfd_ctx *eventfd, const ch    
1613 {                                                
1614         struct mem_cgroup_thresholds *thresho    
1615         struct mem_cgroup_threshold_ary *new;    
1616         unsigned long threshold;                 
1617         unsigned long usage;                     
1618         int i, size, ret;                        
1619                                                  
1620         ret = page_counter_memparse(args, "-1    
1621         if (ret)                                 
1622                 return ret;                      
1623                                                  
1624         mutex_lock(&memcg->thresholds_lock);     
1625                                                  
1626         if (type == _MEM) {                      
1627                 thresholds = &memcg->threshol    
1628                 usage = mem_cgroup_usage(memc    
1629         } else if (type == _MEMSWAP) {           
1630                 thresholds = &memcg->memsw_th    
1631                 usage = mem_cgroup_usage(memc    
1632         } else                                   
1633                 BUG();                           
1634                                                  
1635         /* Check if a threshold crossed befor    
1636         if (thresholds->primary)                 
1637                 __mem_cgroup_threshold(memcg,    
1638                                                  
1639         size = thresholds->primary ? threshol    
1640                                                  
1641         /* Allocate memory for new array of t    
1642         new = kmalloc(struct_size(new, entrie    
1643         if (!new) {                              
1644                 ret = -ENOMEM;                   
1645                 goto unlock;                     
1646         }                                        
1647         new->size = size;                        
1648                                                  
1649         /* Copy thresholds (if any) to new ar    
1650         if (thresholds->primary)                 
1651                 memcpy(new->entries, threshol    
1652                        flex_array_size(new, e    
1653                                                  
1654         /* Add new threshold */                  
1655         new->entries[size - 1].eventfd = even    
1656         new->entries[size - 1].threshold = th    
1657                                                  
1658         /* Sort thresholds. Registering of ne    
1659         sort(new->entries, size, sizeof(*new-    
1660                         compare_thresholds, N    
1661                                                  
1662         /* Find current threshold */             
1663         new->current_threshold = -1;             
1664         for (i = 0; i < size; i++) {             
1665                 if (new->entries[i].threshold    
1666                         /*                       
1667                          * new->current_thres    
1668                          * rcu_assign_pointer    
1669                          * it here.              
1670                          */                      
1671                         ++new->current_thresh    
1672                 } else                           
1673                         break;                   
1674         }                                        
1675                                                  
1676         /* Free old spare buffer and save old    
1677         kfree(thresholds->spare);                
1678         thresholds->spare = thresholds->prima    
1679                                                  
1680         rcu_assign_pointer(thresholds->primar    
1681                                                  
1682         /* To be sure that nobody uses thresh    
1683         synchronize_rcu();                       
1684                                                  
1685 unlock:                                          
1686         mutex_unlock(&memcg->thresholds_lock)    
1687                                                  
1688         return ret;                              
1689 }                                                
1690                                                  
1691 static int mem_cgroup_usage_register_event(st    
1692         struct eventfd_ctx *eventfd, const ch    
1693 {                                                
1694         return __mem_cgroup_usage_register_ev    
1695 }                                                
1696                                                  
1697 static int memsw_cgroup_usage_register_event(    
1698         struct eventfd_ctx *eventfd, const ch    
1699 {                                                
1700         return __mem_cgroup_usage_register_ev    
1701 }                                                
1702                                                  
1703 static void __mem_cgroup_usage_unregister_eve    
1704         struct eventfd_ctx *eventfd, enum res    
1705 {                                                
1706         struct mem_cgroup_thresholds *thresho    
1707         struct mem_cgroup_threshold_ary *new;    
1708         unsigned long usage;                     
1709         int i, j, size, entries;                 
1710                                                  
1711         mutex_lock(&memcg->thresholds_lock);     
1712                                                  
1713         if (type == _MEM) {                      
1714                 thresholds = &memcg->threshol    
1715                 usage = mem_cgroup_usage(memc    
1716         } else if (type == _MEMSWAP) {           
1717                 thresholds = &memcg->memsw_th    
1718                 usage = mem_cgroup_usage(memc    
1719         } else                                   
1720                 BUG();                           
1721                                                  
1722         if (!thresholds->primary)                
1723                 goto unlock;                     
1724                                                  
1725         /* Check if a threshold crossed befor    
1726         __mem_cgroup_threshold(memcg, type ==    
1727                                                  
1728         /* Calculate new number of threshold     
1729         size = entries = 0;                      
1730         for (i = 0; i < thresholds->primary->    
1731                 if (thresholds->primary->entr    
1732                         size++;                  
1733                 else                             
1734                         entries++;               
1735         }                                        
1736                                                  
1737         new = thresholds->spare;                 
1738                                                  
1739         /* If no items related to eventfd hav    
1740         if (!entries)                            
1741                 goto unlock;                     
1742                                                  
1743         /* Set thresholds array to NULL if we    
1744         if (!size) {                             
1745                 kfree(new);                      
1746                 new = NULL;                      
1747                 goto swap_buffers;               
1748         }                                        
1749                                                  
1750         new->size = size;                        
1751                                                  
1752         /* Copy thresholds and find current t    
1753         new->current_threshold = -1;             
1754         for (i = 0, j = 0; i < thresholds->pr    
1755                 if (thresholds->primary->entr    
1756                         continue;                
1757                                                  
1758                 new->entries[j] = thresholds-    
1759                 if (new->entries[j].threshold    
1760                         /*                       
1761                          * new->current_thres    
1762                          * until rcu_assign_p    
1763                          * it here.              
1764                          */                      
1765                         ++new->current_thresh    
1766                 }                                
1767                 j++;                             
1768         }                                        
1769                                                  
1770 swap_buffers:                                    
1771         /* Swap primary and spare array */       
1772         thresholds->spare = thresholds->prima    
1773                                                  
1774         rcu_assign_pointer(thresholds->primar    
1775                                                  
1776         /* To be sure that nobody uses thresh    
1777         synchronize_rcu();                       
1778                                                  
1779         /* If all events are unregistered, fr    
1780         if (!new) {                              
1781                 kfree(thresholds->spare);        
1782                 thresholds->spare = NULL;        
1783         }                                        
1784 unlock:                                          
1785         mutex_unlock(&memcg->thresholds_lock)    
1786 }                                                
1787                                                  
1788 static void mem_cgroup_usage_unregister_event    
1789         struct eventfd_ctx *eventfd)             
1790 {                                                
1791         return __mem_cgroup_usage_unregister_    
1792 }                                                
1793                                                  
1794 static void memsw_cgroup_usage_unregister_eve    
1795         struct eventfd_ctx *eventfd)             
1796 {                                                
1797         return __mem_cgroup_usage_unregister_    
1798 }                                                
1799                                                  
1800 static int mem_cgroup_oom_register_event(stru    
1801         struct eventfd_ctx *eventfd, const ch    
1802 {                                                
1803         struct mem_cgroup_eventfd_list *event    
1804                                                  
1805         event = kmalloc(sizeof(*event), GFP_K    
1806         if (!event)                              
1807                 return -ENOMEM;                  
1808                                                  
1809         spin_lock(&memcg_oom_lock);              
1810                                                  
1811         event->eventfd = eventfd;                
1812         list_add(&event->list, &memcg->oom_no    
1813                                                  
1814         /* already in OOM ? */                   
1815         if (memcg->under_oom)                    
1816                 eventfd_signal(eventfd);         
1817         spin_unlock(&memcg_oom_lock);            
1818                                                  
1819         return 0;                                
1820 }                                                
1821                                                  
1822 static void mem_cgroup_oom_unregister_event(s    
1823         struct eventfd_ctx *eventfd)             
1824 {                                                
1825         struct mem_cgroup_eventfd_list *ev, *    
1826                                                  
1827         spin_lock(&memcg_oom_lock);              
1828                                                  
1829         list_for_each_entry_safe(ev, tmp, &me    
1830                 if (ev->eventfd == eventfd) {    
1831                         list_del(&ev->list);     
1832                         kfree(ev);               
1833                 }                                
1834         }                                        
1835                                                  
1836         spin_unlock(&memcg_oom_lock);            
1837 }                                                
1838                                                  
1839 /*                                               
1840  * DO NOT USE IN NEW FILES.                      
1841  *                                               
1842  * "cgroup.event_control" implementation.        
1843  *                                               
1844  * This is way over-engineered.  It tries to     
1845  * events for each user.  Such level of flexi    
1846  * unnecessary especially in the light of the    
1847  *                                               
1848  * Please deprecate this and replace with som    
1849  * possible.                                     
1850  */                                              
1851                                                  
1852 /*                                               
1853  * Unregister event and free resources.          
1854  *                                               
1855  * Gets called from workqueue.                   
1856  */                                              
1857 static void memcg_event_remove(struct work_st    
1858 {                                                
1859         struct mem_cgroup_event *event =         
1860                 container_of(work, struct mem    
1861         struct mem_cgroup *memcg = event->mem    
1862                                                  
1863         remove_wait_queue(event->wqh, &event-    
1864                                                  
1865         event->unregister_event(memcg, event-    
1866                                                  
1867         /* Notify userspace the event is goin    
1868         eventfd_signal(event->eventfd);          
1869                                                  
1870         eventfd_ctx_put(event->eventfd);         
1871         kfree(event);                            
1872         css_put(&memcg->css);                    
1873 }                                                
1874                                                  
1875 /*                                               
1876  * Gets called on EPOLLHUP on eventfd when us    
1877  *                                               
1878  * Called with wqh->lock held and interrupts     
1879  */                                              
1880 static int memcg_event_wake(wait_queue_entry_    
1881                             int sync, void *k    
1882 {                                                
1883         struct mem_cgroup_event *event =         
1884                 container_of(wait, struct mem    
1885         struct mem_cgroup *memcg = event->mem    
1886         __poll_t flags = key_to_poll(key);       
1887                                                  
1888         if (flags & EPOLLHUP) {                  
1889                 /*                               
1890                  * If the event has been deta    
1891                  * can simply return knowing     
1892                  * for us.                       
1893                  *                               
1894                  * We can't race against even    
1895                  * side will require wqh->loc    
1896                  * which we hold.                
1897                  */                              
1898                 spin_lock(&memcg->event_list_    
1899                 if (!list_empty(&event->list)    
1900                         list_del_init(&event-    
1901                         /*                       
1902                          * We are in atomic c    
1903                          * may sleep, so we h    
1904                          */                      
1905                         schedule_work(&event-    
1906                 }                                
1907                 spin_unlock(&memcg->event_lis    
1908         }                                        
1909                                                  
1910         return 0;                                
1911 }                                                
1912                                                  
1913 static void memcg_event_ptable_queue_proc(str    
1914                 wait_queue_head_t *wqh, poll_    
1915 {                                                
1916         struct mem_cgroup_event *event =         
1917                 container_of(pt, struct mem_c    
1918                                                  
1919         event->wqh = wqh;                        
1920         add_wait_queue(wqh, &event->wait);       
1921 }                                                
1922                                                  
1923 /*                                               
1924  * DO NOT USE IN NEW FILES.                      
1925  *                                               
1926  * Parse input and register new cgroup event     
1927  *                                               
1928  * Input must be in format '<event_fd> <contr    
1929  * Interpretation of args is defined by contr    
1930  */                                              
1931 static ssize_t memcg_write_event_control(stru    
1932                                          char    
1933 {                                                
1934         struct cgroup_subsys_state *css = of_    
1935         struct mem_cgroup *memcg = mem_cgroup    
1936         struct mem_cgroup_event *event;          
1937         struct cgroup_subsys_state *cfile_css    
1938         unsigned int efd, cfd;                   
1939         struct fd efile;                         
1940         struct fd cfile;                         
1941         struct dentry *cdentry;                  
1942         const char *name;                        
1943         char *endp;                              
1944         int ret;                                 
1945                                                  
1946         if (IS_ENABLED(CONFIG_PREEMPT_RT))       
1947                 return -EOPNOTSUPP;              
1948                                                  
1949         buf = strstrip(buf);                     
1950                                                  
1951         efd = simple_strtoul(buf, &endp, 10);    
1952         if (*endp != ' ')                        
1953                 return -EINVAL;                  
1954         buf = endp + 1;                          
1955                                                  
1956         cfd = simple_strtoul(buf, &endp, 10);    
1957         if (*endp == '\0')                       
1958                 buf = endp;                      
1959         else if (*endp == ' ')                   
1960                 buf = endp + 1;                  
1961         else                                     
1962                 return -EINVAL;                  
1963                                                  
1964         event = kzalloc(sizeof(*event), GFP_K    
1965         if (!event)                              
1966                 return -ENOMEM;                  
1967                                                  
1968         event->memcg = memcg;                    
1969         INIT_LIST_HEAD(&event->list);            
1970         init_poll_funcptr(&event->pt, memcg_e    
1971         init_waitqueue_func_entry(&event->wai    
1972         INIT_WORK(&event->remove, memcg_event    
1973                                                  
1974         efile = fdget(efd);                      
1975         if (!fd_file(efile)) {                   
1976                 ret = -EBADF;                    
1977                 goto out_kfree;                  
1978         }                                        
1979                                                  
1980         event->eventfd = eventfd_ctx_fileget(    
1981         if (IS_ERR(event->eventfd)) {            
1982                 ret = PTR_ERR(event->eventfd)    
1983                 goto out_put_efile;              
1984         }                                        
1985                                                  
1986         cfile = fdget(cfd);                      
1987         if (!fd_file(cfile)) {                   
1988                 ret = -EBADF;                    
1989                 goto out_put_eventfd;            
1990         }                                        
1991                                                  
1992         /* the process need read permission o    
1993         /* AV: shouldn't we check that it's b    
1994         ret = file_permission(fd_file(cfile),    
1995         if (ret < 0)                             
1996                 goto out_put_cfile;              
1997                                                  
1998         /*                                       
1999          * The control file must be a regular    
2000          * file can't be renamed, it's safe t    
2001          */                                      
2002         cdentry = fd_file(cfile)->f_path.dent    
2003         if (cdentry->d_sb->s_type != &cgroup_    
2004                 ret = -EINVAL;                   
2005                 goto out_put_cfile;              
2006         }                                        
2007                                                  
2008         /*                                       
2009          * Determine the event callbacks and     
2010          * to be done via struct cftype but c    
2011          * about these events.  The following    
2012          * is for compatibility anyway.          
2013          *                                       
2014          * DO NOT ADD NEW FILES.                 
2015          */                                      
2016         name = cdentry->d_name.name;             
2017                                                  
2018         if (!strcmp(name, "memory.usage_in_by    
2019                 event->register_event = mem_c    
2020                 event->unregister_event = mem    
2021         } else if (!strcmp(name, "memory.oom_    
2022                 pr_warn_once("oom_control is     
2023                              "Please report y    
2024                              " if you depend     
2025                 event->register_event = mem_c    
2026                 event->unregister_event = mem    
2027         } else if (!strcmp(name, "memory.pres    
2028                 pr_warn_once("pressure_level     
2029                              "Please report y    
2030                              "if you depend o    
2031                 event->register_event = vmpre    
2032                 event->unregister_event = vmp    
2033         } else if (!strcmp(name, "memory.mems    
2034                 event->register_event = memsw    
2035                 event->unregister_event = mem    
2036         } else {                                 
2037                 ret = -EINVAL;                   
2038                 goto out_put_cfile;              
2039         }                                        
2040                                                  
2041         /*                                       
2042          * Verify @cfile should belong to @cs    
2043          * automatically removed on cgroup de    
2044          * asynchronous, so take an extra ref    
2045          */                                      
2046         cfile_css = css_tryget_online_from_di    
2047                                                  
2048         ret = -EINVAL;                           
2049         if (IS_ERR(cfile_css))                   
2050                 goto out_put_cfile;              
2051         if (cfile_css != css) {                  
2052                 css_put(cfile_css);              
2053                 goto out_put_cfile;              
2054         }                                        
2055                                                  
2056         ret = event->register_event(memcg, ev    
2057         if (ret)                                 
2058                 goto out_put_css;                
2059                                                  
2060         vfs_poll(fd_file(efile), &event->pt);    
2061                                                  
2062         spin_lock_irq(&memcg->event_list_lock    
2063         list_add(&event->list, &memcg->event_    
2064         spin_unlock_irq(&memcg->event_list_lo    
2065                                                  
2066         fdput(cfile);                            
2067         fdput(efile);                            
2068                                                  
2069         return nbytes;                           
2070                                                  
2071 out_put_css:                                     
2072         css_put(css);                            
2073 out_put_cfile:                                   
2074         fdput(cfile);                            
2075 out_put_eventfd:                                 
2076         eventfd_ctx_put(event->eventfd);         
2077 out_put_efile:                                   
2078         fdput(efile);                            
2079 out_kfree:                                       
2080         kfree(event);                            
2081                                                  
2082         return ret;                              
2083 }                                                
2084                                                  
2085 void memcg1_memcg_init(struct mem_cgroup *mem    
2086 {                                                
2087         INIT_LIST_HEAD(&memcg->oom_notify);      
2088         mutex_init(&memcg->thresholds_lock);     
2089         spin_lock_init(&memcg->move_lock);       
2090         INIT_LIST_HEAD(&memcg->event_list);      
2091         spin_lock_init(&memcg->event_list_loc    
2092 }                                                
2093                                                  
2094 void memcg1_css_offline(struct mem_cgroup *me    
2095 {                                                
2096         struct mem_cgroup_event *event, *tmp;    
2097                                                  
2098         /*                                       
2099          * Unregister events and notify users    
2100          * Notify userspace about cgroup remo    
2101          * directory to avoid race between us    
2102          */                                      
2103         spin_lock_irq(&memcg->event_list_lock    
2104         list_for_each_entry_safe(event, tmp,     
2105                 list_del_init(&event->list);     
2106                 schedule_work(&event->remove)    
2107         }                                        
2108         spin_unlock_irq(&memcg->event_list_lo    
2109 }                                                
2110                                                  
2111 /*                                               
2112  * Check OOM-Killer is already running under     
2113  * If someone is running, return false.          
2114  */                                              
2115 static bool mem_cgroup_oom_trylock(struct mem    
2116 {                                                
2117         struct mem_cgroup *iter, *failed = NU    
2118                                                  
2119         spin_lock(&memcg_oom_lock);              
2120                                                  
2121         for_each_mem_cgroup_tree(iter, memcg)    
2122                 if (iter->oom_lock) {            
2123                         /*                       
2124                          * this subtree of ou    
2125                          * so we cannot give     
2126                          */                      
2127                         failed = iter;           
2128                         mem_cgroup_iter_break    
2129                         break;                   
2130                 } else                           
2131                         iter->oom_lock = true    
2132         }                                        
2133                                                  
2134         if (failed) {                            
2135                 /*                               
2136                  * OK, we failed to lock the     
2137                  * to clean up what we set up    
2138                  */                              
2139                 for_each_mem_cgroup_tree(iter    
2140                         if (iter == failed) {    
2141                                 mem_cgroup_it    
2142                                 break;           
2143                         }                        
2144                         iter->oom_lock = fals    
2145                 }                                
2146         } else                                   
2147                 mutex_acquire(&memcg_oom_lock    
2148                                                  
2149         spin_unlock(&memcg_oom_lock);            
2150                                                  
2151         return !failed;                          
2152 }                                                
2153                                                  
2154 static void mem_cgroup_oom_unlock(struct mem_    
2155 {                                                
2156         struct mem_cgroup *iter;                 
2157                                                  
2158         spin_lock(&memcg_oom_lock);              
2159         mutex_release(&memcg_oom_lock_dep_map    
2160         for_each_mem_cgroup_tree(iter, memcg)    
2161                 iter->oom_lock = false;          
2162         spin_unlock(&memcg_oom_lock);            
2163 }                                                
2164                                                  
2165 static void mem_cgroup_mark_under_oom(struct     
2166 {                                                
2167         struct mem_cgroup *iter;                 
2168                                                  
2169         spin_lock(&memcg_oom_lock);              
2170         for_each_mem_cgroup_tree(iter, memcg)    
2171                 iter->under_oom++;               
2172         spin_unlock(&memcg_oom_lock);            
2173 }                                                
2174                                                  
2175 static void mem_cgroup_unmark_under_oom(struc    
2176 {                                                
2177         struct mem_cgroup *iter;                 
2178                                                  
2179         /*                                       
2180          * Be careful about under_oom underfl    
2181          * could have been added after mem_cg    
2182          */                                      
2183         spin_lock(&memcg_oom_lock);              
2184         for_each_mem_cgroup_tree(iter, memcg)    
2185                 if (iter->under_oom > 0)         
2186                         iter->under_oom--;       
2187         spin_unlock(&memcg_oom_lock);            
2188 }                                                
2189                                                  
2190 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_wait    
2191                                                  
2192 struct oom_wait_info {                           
2193         struct mem_cgroup *memcg;                
2194         wait_queue_entry_t      wait;            
2195 };                                               
2196                                                  
2197 static int memcg_oom_wake_function(wait_queue    
2198         unsigned mode, int sync, void *arg)      
2199 {                                                
2200         struct mem_cgroup *wake_memcg = (stru    
2201         struct mem_cgroup *oom_wait_memcg;       
2202         struct oom_wait_info *oom_wait_info;     
2203                                                  
2204         oom_wait_info = container_of(wait, st    
2205         oom_wait_memcg = oom_wait_info->memcg    
2206                                                  
2207         if (!mem_cgroup_is_descendant(wake_me    
2208             !mem_cgroup_is_descendant(oom_wai    
2209                 return 0;                        
2210         return autoremove_wake_function(wait,    
2211 }                                                
2212                                                  
2213 void memcg1_oom_recover(struct mem_cgroup *me    
2214 {                                                
2215         /*                                       
2216          * For the following lockless ->under    
2217          * guarantee is that it must see the     
2218          * this function is called as a resul    
2219          * triggered by the notification of t    
2220          * achieved by invoking mem_cgroup_ma    
2221          * triggering notification.              
2222          */                                      
2223         if (memcg && memcg->under_oom)           
2224                 __wake_up(&memcg_oom_waitq, T    
2225 }                                                
2226                                                  
2227 /**                                              
2228  * mem_cgroup_oom_synchronize - complete memc    
2229  * @handle: actually kill/wait or just clean     
2230  *                                               
2231  * This has to be called at the end of a page    
2232  * handler was enabled.                          
2233  *                                               
2234  * Memcg supports userspace OOM handling wher    
2235  * sleep on a waitqueue until the userspace t    
2236  * situation.  Sleeping directly in the charg    
2237  * of locks held is not a good idea, instead     
2238  * in the task and mem_cgroup_oom_synchronize    
2239  * the end of the page fault to complete the     
2240  *                                               
2241  * Returns %true if an ongoing memcg OOM situ    
2242  * completed, %false otherwise.                  
2243  */                                              
2244 bool mem_cgroup_oom_synchronize(bool handle)     
2245 {                                                
2246         struct mem_cgroup *memcg = current->m    
2247         struct oom_wait_info owait;              
2248         bool locked;                             
2249                                                  
2250         /* OOM is global, do not handle */       
2251         if (!memcg)                              
2252                 return false;                    
2253                                                  
2254         if (!handle)                             
2255                 goto cleanup;                    
2256                                                  
2257         owait.memcg = memcg;                     
2258         owait.wait.flags = 0;                    
2259         owait.wait.func = memcg_oom_wake_func    
2260         owait.wait.private = current;            
2261         INIT_LIST_HEAD(&owait.wait.entry);       
2262                                                  
2263         prepare_to_wait(&memcg_oom_waitq, &ow    
2264         mem_cgroup_mark_under_oom(memcg);        
2265                                                  
2266         locked = mem_cgroup_oom_trylock(memcg    
2267                                                  
2268         if (locked)                              
2269                 mem_cgroup_oom_notify(memcg);    
2270                                                  
2271         schedule();                              
2272         mem_cgroup_unmark_under_oom(memcg);      
2273         finish_wait(&memcg_oom_waitq, &owait.    
2274                                                  
2275         if (locked)                              
2276                 mem_cgroup_oom_unlock(memcg);    
2277 cleanup:                                         
2278         current->memcg_in_oom = NULL;            
2279         css_put(&memcg->css);                    
2280         return true;                             
2281 }                                                
2282                                                  
2283                                                  
2284 bool memcg1_oom_prepare(struct mem_cgroup *me    
2285 {                                                
2286         /*                                       
2287          * We are in the middle of the charge    
2288          * don't want to block when potential    
2289          * that holds all kinds of filesystem    
2290          *                                       
2291          * cgroup1 allows disabling the OOM k    
2292          * handling until the charge can succ    
2293          * the task to sleep at the end of th    
2294          * released.                             
2295          *                                       
2296          * On the other hand, in-kernel OOM k    
2297          * memory reclaim (oom_reaper) and th    
2298          * relying on the oom victim to make     
2299          * invoke the oom killer here.           
2300          *                                       
2301          * Please note that mem_cgroup_out_of    
2302          * victim and then we have to bail ou    
2303          */                                      
2304         if (READ_ONCE(memcg->oom_kill_disable    
2305                 if (current->in_user_fault) {    
2306                         css_get(&memcg->css);    
2307                         current->memcg_in_oom    
2308                 }                                
2309                 return false;                    
2310         }                                        
2311                                                  
2312         mem_cgroup_mark_under_oom(memcg);        
2313                                                  
2314         *locked = mem_cgroup_oom_trylock(memc    
2315                                                  
2316         if (*locked)                             
2317                 mem_cgroup_oom_notify(memcg);    
2318                                                  
2319         mem_cgroup_unmark_under_oom(memcg);      
2320                                                  
2321         return true;                             
2322 }                                                
2323                                                  
2324 void memcg1_oom_finish(struct mem_cgroup *mem    
2325 {                                                
2326         if (locked)                              
2327                 mem_cgroup_oom_unlock(memcg);    
2328 }                                                
2329                                                  
2330 static DEFINE_MUTEX(memcg_max_mutex);            
2331                                                  
2332 static int mem_cgroup_resize_max(struct mem_c    
2333                                  unsigned lon    
2334 {                                                
2335         bool enlarge = false;                    
2336         bool drained = false;                    
2337         int ret;                                 
2338         bool limits_invariant;                   
2339         struct page_counter *counter = memsw     
2340                                                  
2341         do {                                     
2342                 if (signal_pending(current))     
2343                         ret = -EINTR;            
2344                         break;                   
2345                 }                                
2346                                                  
2347                 mutex_lock(&memcg_max_mutex);    
2348                 /*                               
2349                  * Make sure that the new lim    
2350                  * break our basic invariant     
2351                  */                              
2352                 limits_invariant = memsw ? ma    
2353                                            ma    
2354                 if (!limits_invariant) {         
2355                         mutex_unlock(&memcg_m    
2356                         ret = -EINVAL;           
2357                         break;                   
2358                 }                                
2359                 if (max > counter->max)          
2360                         enlarge = true;          
2361                 ret = page_counter_set_max(co    
2362                 mutex_unlock(&memcg_max_mutex    
2363                                                  
2364                 if (!ret)                        
2365                         break;                   
2366                                                  
2367                 if (!drained) {                  
2368                         drain_all_stock(memcg    
2369                         drained = true;          
2370                         continue;                
2371                 }                                
2372                                                  
2373                 if (!try_to_free_mem_cgroup_p    
2374                                 memsw ? 0 : M    
2375                         ret = -EBUSY;            
2376                         break;                   
2377                 }                                
2378         } while (true);                          
2379                                                  
2380         if (!ret && enlarge)                     
2381                 memcg1_oom_recover(memcg);       
2382                                                  
2383         return ret;                              
2384 }                                                
2385                                                  
2386 /*                                               
2387  * Reclaims as many pages from the given memc    
2388  *                                               
2389  * Caller is responsible for holding css refe    
2390  */                                              
2391 static int mem_cgroup_force_empty(struct mem_    
2392 {                                                
2393         int nr_retries = MAX_RECLAIM_RETRIES;    
2394                                                  
2395         /* we call try-to-free pages for make    
2396         lru_add_drain_all();                     
2397                                                  
2398         drain_all_stock(memcg);                  
2399                                                  
2400         /* try to free all pages in this cgro    
2401         while (nr_retries && page_counter_rea    
2402                 if (signal_pending(current))     
2403                         return -EINTR;           
2404                                                  
2405                 if (!try_to_free_mem_cgroup_p    
2406                                                  
2407                         nr_retries--;            
2408         }                                        
2409                                                  
2410         return 0;                                
2411 }                                                
2412                                                  
2413 static ssize_t mem_cgroup_force_empty_write(s    
2414                                             c    
2415                                             l    
2416 {                                                
2417         struct mem_cgroup *memcg = mem_cgroup    
2418                                                  
2419         if (mem_cgroup_is_root(memcg))           
2420                 return -EINVAL;                  
2421         return mem_cgroup_force_empty(memcg)     
2422 }                                                
2423                                                  
2424 static u64 mem_cgroup_hierarchy_read(struct c    
2425                                      struct c    
2426 {                                                
2427         return 1;                                
2428 }                                                
2429                                                  
2430 static int mem_cgroup_hierarchy_write(struct     
2431                                       struct     
2432 {                                                
2433         if (val == 1)                            
2434                 return 0;                        
2435                                                  
2436         pr_warn_once("Non-hierarchical mode i    
2437                      "Please report your usec    
2438                      "depend on this function    
2439                                                  
2440         return -EINVAL;                          
2441 }                                                
2442                                                  
2443 static u64 mem_cgroup_read_u64(struct cgroup_    
2444                                struct cftype     
2445 {                                                
2446         struct mem_cgroup *memcg = mem_cgroup    
2447         struct page_counter *counter;            
2448                                                  
2449         switch (MEMFILE_TYPE(cft->private)) {    
2450         case _MEM:                               
2451                 counter = &memcg->memory;        
2452                 break;                           
2453         case _MEMSWAP:                           
2454                 counter = &memcg->memsw;         
2455                 break;                           
2456         case _KMEM:                              
2457                 counter = &memcg->kmem;          
2458                 break;                           
2459         case _TCP:                               
2460                 counter = &memcg->tcpmem;        
2461                 break;                           
2462         default:                                 
2463                 BUG();                           
2464         }                                        
2465                                                  
2466         switch (MEMFILE_ATTR(cft->private)) {    
2467         case RES_USAGE:                          
2468                 if (counter == &memcg->memory    
2469                         return (u64)mem_cgrou    
2470                 if (counter == &memcg->memsw)    
2471                         return (u64)mem_cgrou    
2472                 return (u64)page_counter_read    
2473         case RES_LIMIT:                          
2474                 return (u64)counter->max * PA    
2475         case RES_MAX_USAGE:                      
2476                 return (u64)counter->watermar    
2477         case RES_FAILCNT:                        
2478                 return counter->failcnt;         
2479         case RES_SOFT_LIMIT:                     
2480                 return (u64)READ_ONCE(memcg->    
2481         default:                                 
2482                 BUG();                           
2483         }                                        
2484 }                                                
2485                                                  
2486 /*                                               
2487  * This function doesn't do anything useful.     
2488  * handler for a file so that cgroup_file_mod    
2489  */                                              
2490 static int mem_cgroup_dummy_seq_show(__always    
2491                                      __always    
2492 {                                                
2493         return -EINVAL;                          
2494 }                                                
2495                                                  
2496 static int memcg_update_tcp_max(struct mem_cg    
2497 {                                                
2498         int ret;                                 
2499                                                  
2500         mutex_lock(&memcg_max_mutex);            
2501                                                  
2502         ret = page_counter_set_max(&memcg->tc    
2503         if (ret)                                 
2504                 goto out;                        
2505                                                  
2506         if (!memcg->tcpmem_active) {             
2507                 /*                               
2508                  * The active flag needs to b    
2509                  * update. This is what guara    
2510                  * function is the last one t    
2511                  * for details, and note that    
2512                  * belonging to this memcg un    
2513                  *                               
2514                  * We need to do this, becaus    
2515                  * sites, but we can't contro    
2516                  * as accounted, but the acco    
2517                  * yet, we'll lose accounting    
2518                  *                               
2519                  * We never race with the rea    
2520                  * because when this value ch    
2521                  * patched in yet.               
2522                  */                              
2523                 static_branch_inc(&memcg_sock    
2524                 memcg->tcpmem_active = true;     
2525         }                                        
2526 out:                                             
2527         mutex_unlock(&memcg_max_mutex);          
2528         return ret;                              
2529 }                                                
2530                                                  
2531 /*                                               
2532  * The user of this function is...               
2533  * RES_LIMIT.                                    
2534  */                                              
2535 static ssize_t mem_cgroup_write(struct kernfs    
2536                                 char *buf, si    
2537 {                                                
2538         struct mem_cgroup *memcg = mem_cgroup    
2539         unsigned long nr_pages;                  
2540         int ret;                                 
2541                                                  
2542         buf = strstrip(buf);                     
2543         ret = page_counter_memparse(buf, "-1"    
2544         if (ret)                                 
2545                 return ret;                      
2546                                                  
2547         switch (MEMFILE_ATTR(of_cft(of)->priv    
2548         case RES_LIMIT:                          
2549                 if (mem_cgroup_is_root(memcg)    
2550                         ret = -EINVAL;           
2551                         break;                   
2552                 }                                
2553                 switch (MEMFILE_TYPE(of_cft(o    
2554                 case _MEM:                       
2555                         ret = mem_cgroup_resi    
2556                         break;                   
2557                 case _MEMSWAP:                   
2558                         ret = mem_cgroup_resi    
2559                         break;                   
2560                 case _KMEM:                      
2561                         pr_warn_once("kmem.li    
2562                                      "Writing    
2563                                      "Please     
2564                                      "depend     
2565                         ret = 0;                 
2566                         break;                   
2567                 case _TCP:                       
2568                         pr_warn_once("kmem.tc    
2569                                      "Please     
2570                                      "depend     
2571                         ret = memcg_update_tc    
2572                         break;                   
2573                 }                                
2574                 break;                           
2575         case RES_SOFT_LIMIT:                     
2576                 if (IS_ENABLED(CONFIG_PREEMPT    
2577                         ret = -EOPNOTSUPP;       
2578                 } else {                         
2579                         pr_warn_once("soft_li    
2580                                      "Please     
2581                                      "depend     
2582                         WRITE_ONCE(memcg->sof    
2583                         ret = 0;                 
2584                 }                                
2585                 break;                           
2586         }                                        
2587         return ret ?: nbytes;                    
2588 }                                                
2589                                                  
2590 static ssize_t mem_cgroup_reset(struct kernfs    
2591                                 size_t nbytes    
2592 {                                                
2593         struct mem_cgroup *memcg = mem_cgroup    
2594         struct page_counter *counter;            
2595                                                  
2596         switch (MEMFILE_TYPE(of_cft(of)->priv    
2597         case _MEM:                               
2598                 counter = &memcg->memory;        
2599                 break;                           
2600         case _MEMSWAP:                           
2601                 counter = &memcg->memsw;         
2602                 break;                           
2603         case _KMEM:                              
2604                 counter = &memcg->kmem;          
2605                 break;                           
2606         case _TCP:                               
2607                 counter = &memcg->tcpmem;        
2608                 break;                           
2609         default:                                 
2610                 BUG();                           
2611         }                                        
2612                                                  
2613         switch (MEMFILE_ATTR(of_cft(of)->priv    
2614         case RES_MAX_USAGE:                      
2615                 page_counter_reset_watermark(    
2616                 break;                           
2617         case RES_FAILCNT:                        
2618                 counter->failcnt = 0;            
2619                 break;                           
2620         default:                                 
2621                 BUG();                           
2622         }                                        
2623                                                  
2624         return nbytes;                           
2625 }                                                
2626                                                  
2627 #ifdef CONFIG_NUMA                               
2628                                                  
2629 #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE)     
2630 #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON)     
2631 #define LRU_ALL      ((1 << NR_LRU_LISTS) - 1    
2632                                                  
2633 static unsigned long mem_cgroup_node_nr_lru_p    
2634                                 int nid, unsi    
2635 {                                                
2636         struct lruvec *lruvec = mem_cgroup_lr    
2637         unsigned long nr = 0;                    
2638         enum lru_list lru;                       
2639                                                  
2640         VM_BUG_ON((unsigned)nid >= nr_node_id    
2641                                                  
2642         for_each_lru(lru) {                      
2643                 if (!(BIT(lru) & lru_mask))      
2644                         continue;                
2645                 if (tree)                        
2646                         nr += lruvec_page_sta    
2647                 else                             
2648                         nr += lruvec_page_sta    
2649         }                                        
2650         return nr;                               
2651 }                                                
2652                                                  
2653 static unsigned long mem_cgroup_nr_lru_pages(    
2654                                                  
2655                                                  
2656 {                                                
2657         unsigned long nr = 0;                    
2658         enum lru_list lru;                       
2659                                                  
2660         for_each_lru(lru) {                      
2661                 if (!(BIT(lru) & lru_mask))      
2662                         continue;                
2663                 if (tree)                        
2664                         nr += memcg_page_stat    
2665                 else                             
2666                         nr += memcg_page_stat    
2667         }                                        
2668         return nr;                               
2669 }                                                
2670                                                  
2671 static int memcg_numa_stat_show(struct seq_fi    
2672 {                                                
2673         struct numa_stat {                       
2674                 const char *name;                
2675                 unsigned int lru_mask;           
2676         };                                       
2677                                                  
2678         static const struct numa_stat stats[]    
2679                 { "total", LRU_ALL },            
2680                 { "file", LRU_ALL_FILE },        
2681                 { "anon", LRU_ALL_ANON },        
2682                 { "unevictable", BIT(LRU_UNEV    
2683         };                                       
2684         const struct numa_stat *stat;            
2685         int nid;                                 
2686         struct mem_cgroup *memcg = mem_cgroup    
2687                                                  
2688         mem_cgroup_flush_stats(memcg);           
2689                                                  
2690         for (stat = stats; stat < stats + ARR    
2691                 seq_printf(m, "%s=%lu", stat-    
2692                            mem_cgroup_nr_lru_    
2693                                                  
2694                 for_each_node_state(nid, N_ME    
2695                         seq_printf(m, " N%d=%    
2696                                    mem_cgroup    
2697                                                  
2698                 seq_putc(m, '\n');               
2699         }                                        
2700                                                  
2701         for (stat = stats; stat < stats + ARR    
2702                                                  
2703                 seq_printf(m, "hierarchical_%    
2704                            mem_cgroup_nr_lru_    
2705                                                  
2706                 for_each_node_state(nid, N_ME    
2707                         seq_printf(m, " N%d=%    
2708                                    mem_cgroup    
2709                                                  
2710                 seq_putc(m, '\n');               
2711         }                                        
2712                                                  
2713         return 0;                                
2714 }                                                
2715 #endif /* CONFIG_NUMA */                         
2716                                                  
2717 static const unsigned int memcg1_stats[] = {     
2718         NR_FILE_PAGES,                           
2719         NR_ANON_MAPPED,                          
2720 #ifdef CONFIG_TRANSPARENT_HUGEPAGE               
2721         NR_ANON_THPS,                            
2722 #endif                                           
2723         NR_SHMEM,                                
2724         NR_FILE_MAPPED,                          
2725         NR_FILE_DIRTY,                           
2726         NR_WRITEBACK,                            
2727         WORKINGSET_REFAULT_ANON,                 
2728         WORKINGSET_REFAULT_FILE,                 
2729 #ifdef CONFIG_SWAP                               
2730         MEMCG_SWAP,                              
2731         NR_SWAPCACHE,                            
2732 #endif                                           
2733 };                                               
2734                                                  
2735 static const char *const memcg1_stat_names[]     
2736         "cache",                                 
2737         "rss",                                   
2738 #ifdef CONFIG_TRANSPARENT_HUGEPAGE               
2739         "rss_huge",                              
2740 #endif                                           
2741         "shmem",                                 
2742         "mapped_file",                           
2743         "dirty",                                 
2744         "writeback",                             
2745         "workingset_refault_anon",               
2746         "workingset_refault_file",               
2747 #ifdef CONFIG_SWAP                               
2748         "swap",                                  
2749         "swapcached",                            
2750 #endif                                           
2751 };                                               
2752                                                  
2753 /* Universal VM events cgroup1 shows, origina    
2754 static const unsigned int memcg1_events[] = {    
2755         PGPGIN,                                  
2756         PGPGOUT,                                 
2757         PGFAULT,                                 
2758         PGMAJFAULT,                              
2759 };                                               
2760                                                  
2761 void memcg1_stat_format(struct mem_cgroup *me    
2762 {                                                
2763         unsigned long memory, memsw;             
2764         struct mem_cgroup *mi;                   
2765         unsigned int i;                          
2766                                                  
2767         BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_n    
2768                                                  
2769         mem_cgroup_flush_stats(memcg);           
2770                                                  
2771         for (i = 0; i < ARRAY_SIZE(memcg1_sta    
2772                 unsigned long nr;                
2773                                                  
2774                 nr = memcg_page_state_local_o    
2775                 seq_buf_printf(s, "%s %lu\n",    
2776         }                                        
2777                                                  
2778         for (i = 0; i < ARRAY_SIZE(memcg1_eve    
2779                 seq_buf_printf(s, "%s %lu\n",    
2780                                memcg_events_l    
2781                                                  
2782         for (i = 0; i < NR_LRU_LISTS; i++)       
2783                 seq_buf_printf(s, "%s %lu\n",    
2784                                memcg_page_sta    
2785                                PAGE_SIZE);       
2786                                                  
2787         /* Hierarchical information */           
2788         memory = memsw = PAGE_COUNTER_MAX;       
2789         for (mi = memcg; mi; mi = parent_mem_    
2790                 memory = min(memory, READ_ONC    
2791                 memsw = min(memsw, READ_ONCE(    
2792         }                                        
2793         seq_buf_printf(s, "hierarchical_memor    
2794                        (u64)memory * PAGE_SIZ    
2795         seq_buf_printf(s, "hierarchical_memsw    
2796                        (u64)memsw * PAGE_SIZE    
2797                                                  
2798         for (i = 0; i < ARRAY_SIZE(memcg1_sta    
2799                 unsigned long nr;                
2800                                                  
2801                 nr = memcg_page_state_output(    
2802                 seq_buf_printf(s, "total_%s %    
2803                                (u64)nr);         
2804         }                                        
2805                                                  
2806         for (i = 0; i < ARRAY_SIZE(memcg1_eve    
2807                 seq_buf_printf(s, "total_%s %    
2808                                vm_event_name(    
2809                                (u64)memcg_eve    
2810                                                  
2811         for (i = 0; i < NR_LRU_LISTS; i++)       
2812                 seq_buf_printf(s, "total_%s %    
2813                                (u64)memcg_pag    
2814                                PAGE_SIZE);       
2815                                                  
2816 #ifdef CONFIG_DEBUG_VM                           
2817         {                                        
2818                 pg_data_t *pgdat;                
2819                 struct mem_cgroup_per_node *m    
2820                 unsigned long anon_cost = 0;     
2821                 unsigned long file_cost = 0;     
2822                                                  
2823                 for_each_online_pgdat(pgdat)     
2824                         mz = memcg->nodeinfo[    
2825                                                  
2826                         anon_cost += mz->lruv    
2827                         file_cost += mz->lruv    
2828                 }                                
2829                 seq_buf_printf(s, "anon_cost     
2830                 seq_buf_printf(s, "file_cost     
2831         }                                        
2832 #endif                                           
2833 }                                                
2834                                                  
2835 static u64 mem_cgroup_swappiness_read(struct     
2836                                       struct     
2837 {                                                
2838         struct mem_cgroup *memcg = mem_cgroup    
2839                                                  
2840         return mem_cgroup_swappiness(memcg);     
2841 }                                                
2842                                                  
2843 static int mem_cgroup_swappiness_write(struct    
2844                                        struct    
2845 {                                                
2846         struct mem_cgroup *memcg = mem_cgroup    
2847                                                  
2848         if (val > MAX_SWAPPINESS)                
2849                 return -EINVAL;                  
2850                                                  
2851         if (!mem_cgroup_is_root(memcg))          
2852                 WRITE_ONCE(memcg->swappiness,    
2853         else                                     
2854                 WRITE_ONCE(vm_swappiness, val    
2855                                                  
2856         return 0;                                
2857 }                                                
2858                                                  
2859 static int mem_cgroup_oom_control_read(struct    
2860 {                                                
2861         struct mem_cgroup *memcg = mem_cgroup    
2862                                                  
2863         seq_printf(sf, "oom_kill_disable %d\n    
2864         seq_printf(sf, "under_oom %d\n", (boo    
2865         seq_printf(sf, "oom_kill %lu\n",         
2866                    atomic_long_read(&memcg->m    
2867         return 0;                                
2868 }                                                
2869                                                  
/*
 * u64 write handler that updates memcg->oom_kill_disable.
 *
 * Each call goes through pr_warn_once(), so the deprecation notice is
 * printed at most once.  The write is rejected with -EINVAL when the target
 * is the root cgroup or when val fails the range check on the same line
 * (that check is truncated in this listing).  Otherwise the new setting is
 * stored with WRITE_ONCE() and, when val is zero, memcg1_oom_recover() is
 * called for the cgroup.
 *
 * Returns 0 on success, -EINVAL on rejected input.
 */
2870 static int mem_cgroup_oom_control_write(struc    
2871         struct cftype *cft, u64 val)             
2872 {                                                
2873         struct mem_cgroup *memcg = mem_cgroup    
2874                                                  
2875         pr_warn_once("oom_control is deprecat    
2876                      "Please report your usec    
2877                      "depend on this function    
2878                                                  
2879         /* cannot set to root cgroup and only    
2880         if (mem_cgroup_is_root(memcg) || !((v    
2881                 return -EINVAL;                  
2882                                                  
2883         WRITE_ONCE(memcg->oom_kill_disable, v    
2884         if (!val)                                
2885                 memcg1_oom_recover(memcg);       
2886                                                  
2887         return 0;                                
2888 }                                                
2889                                                  
/*
 * Stub show handler, built only when CONFIG_SLUB_DEBUG is defined.
 * It writes nothing and always returns 0; the in-body comment marks the
 * interface as deprecated and points the reader at a tool under tools/
 * (the path is truncated in this listing).
 */
2890 #ifdef CONFIG_SLUB_DEBUG                         
2891 static int mem_cgroup_slab_show(struct seq_fi    
2892 {                                                
2893         /*                                       
2894          * Deprecated.                           
2895          * Please, take a look at tools/cgrou    
2896          */                                      
2897         return 0;                                
2898 }                                                
2899 #endif                                           
2900                                                  
/*
 * Table of cftype entries, closed by an empty terminator entry.
 *
 * Each entry gives a file name and wires it to handlers through the
 * .read_u64, .write, .write_u64 or .seq_show members; many entries also
 * carry a .private value built with MEMFILE_PRIVATE().  The table is laid
 * out in groups: the plain counters (usage, max usage, limits, failcnt),
 * stat/control files, the kmem.* files and the kmem.tcp.* files.
 *
 * Two entries are conditional: "numa_stat" exists only with CONFIG_NUMA and
 * "kmem.slabinfo" only with CONFIG_SLUB_DEBUG.
 *
 * NOTE(review): several names, handler identifiers and MEMFILE_PRIVATE()
 * arguments are cut off in this listing; consult the complete file before
 * relying on any individual entry.
 */
2901 struct cftype mem_cgroup_legacy_files[] = {      
2902         {                                        
2903                 .name = "usage_in_bytes",        
2904                 .private = MEMFILE_PRIVATE(_M    
2905                 .read_u64 = mem_cgroup_read_u    
2906         },                                       
2907         {                                        
2908                 .name = "max_usage_in_bytes",    
2909                 .private = MEMFILE_PRIVATE(_M    
2910                 .write = mem_cgroup_reset,       
2911                 .read_u64 = mem_cgroup_read_u    
2912         },                                       
2913         {                                        
2914                 .name = "limit_in_bytes",        
2915                 .private = MEMFILE_PRIVATE(_M    
2916                 .write = mem_cgroup_write,       
2917                 .read_u64 = mem_cgroup_read_u    
2918         },                                       
2919         {                                        
2920                 .name = "soft_limit_in_bytes"    
2921                 .private = MEMFILE_PRIVATE(_M    
2922                 .write = mem_cgroup_write,       
2923                 .read_u64 = mem_cgroup_read_u    
2924         },                                       
2925         {                                        
2926                 .name = "failcnt",               
2927                 .private = MEMFILE_PRIVATE(_M    
2928                 .write = mem_cgroup_reset,       
2929                 .read_u64 = mem_cgroup_read_u    
2930         },                                       
2931         {                                        
2932                 .name = "stat",                  
2933                 .seq_show = memory_stat_show,    
2934         },                                       
2935         {                                        
2936                 .name = "force_empty",           
2937                 .write = mem_cgroup_force_emp    
2938         },                                       
2939         {                                        
2940                 .name = "use_hierarchy",         
2941                 .write_u64 = mem_cgroup_hiera    
2942                 .read_u64 = mem_cgroup_hierar    
2943         },                                       
2944         {                                        
2945                 .name = "cgroup.event_control    
2946                 .write = memcg_write_event_co    
2947                 .flags = CFTYPE_NO_PREFIX | C    
2948         },                                       
2949         {                                        
2950                 .name = "swappiness",            
2951                 .read_u64 = mem_cgroup_swappi    
2952                 .write_u64 = mem_cgroup_swapp    
2953         },                                       
2954         {                                        
2955                 .name = "move_charge_at_immig    
2956                 .read_u64 = mem_cgroup_move_c    
2957                 .write_u64 = mem_cgroup_move_    
2958         },                                       
2959         {                                        
2960                 .name = "oom_control",           
2961                 .seq_show = mem_cgroup_oom_co    
2962                 .write_u64 = mem_cgroup_oom_c    
2963         },                                       
2964         {                                        
2965                 .name = "pressure_level",        
2966                 .seq_show = mem_cgroup_dummy_    
2967         },                                       
        /* Present only when the kernel is built with CONFIG_NUMA. */
2968 #ifdef CONFIG_NUMA                               
2969         {                                        
2970                 .name = "numa_stat",             
2971                 .seq_show = memcg_numa_stat_s    
2972         },                                       
2973 #endif                                           
2974         {                                        
2975                 .name = "kmem.limit_in_bytes"    
2976                 .private = MEMFILE_PRIVATE(_K    
2977                 .write = mem_cgroup_write,       
2978                 .read_u64 = mem_cgroup_read_u    
2979         },                                       
2980         {                                        
2981                 .name = "kmem.usage_in_bytes"    
2982                 .private = MEMFILE_PRIVATE(_K    
2983                 .read_u64 = mem_cgroup_read_u    
2984         },                                       
2985         {                                        
2986                 .name = "kmem.failcnt",          
2987                 .private = MEMFILE_PRIVATE(_K    
2988                 .write = mem_cgroup_reset,       
2989                 .read_u64 = mem_cgroup_read_u    
2990         },                                       
2991         {                                        
2992                 .name = "kmem.max_usage_in_by    
2993                 .private = MEMFILE_PRIVATE(_K    
2994                 .write = mem_cgroup_reset,       
2995                 .read_u64 = mem_cgroup_read_u    
2996         },                                       
        /* Present only when the kernel is built with CONFIG_SLUB_DEBUG. */
2997 #ifdef CONFIG_SLUB_DEBUG                         
2998         {                                        
2999                 .name = "kmem.slabinfo",         
3000                 .seq_show = mem_cgroup_slab_s    
3001         },                                       
3002 #endif                                           
3003         {                                        
3004                 .name = "kmem.tcp.limit_in_by    
3005                 .private = MEMFILE_PRIVATE(_T    
3006                 .write = mem_cgroup_write,       
3007                 .read_u64 = mem_cgroup_read_u    
3008         },                                       
3009         {                                        
3010                 .name = "kmem.tcp.usage_in_by    
3011                 .private = MEMFILE_PRIVATE(_T    
3012                 .read_u64 = mem_cgroup_read_u    
3013         },                                       
3014         {                                        
3015                 .name = "kmem.tcp.failcnt",      
3016                 .private = MEMFILE_PRIVATE(_T    
3017                 .write = mem_cgroup_reset,       
3018                 .read_u64 = mem_cgroup_read_u    
3019         },                                       
3020         {                                        
3021                 .name = "kmem.tcp.max_usage_i    
3022                 .private = MEMFILE_PRIVATE(_T    
3023                 .write = mem_cgroup_reset,       
3024                 .read_u64 = mem_cgroup_read_u    
3025         },                                       
3026         { },    /* terminate */                  
3027 };                                               
3028                                                  
/*
 * Table of cftype entries for the "memsw.*" files, closed by an empty
 * terminator entry.  It has four entries: usage, max usage, limit and
 * failcnt.  The usage entry is read-only (.read_u64 only); the max-usage
 * and failcnt entries use mem_cgroup_reset as their .write handler, and the
 * limit entry uses mem_cgroup_write.
 *
 * NOTE(review): names and MEMFILE_PRIVATE() arguments are cut off in this
 * listing; confirm them against the complete file.
 */
3029 struct cftype memsw_files[] = {                  
3030         {                                        
3031                 .name = "memsw.usage_in_bytes    
3032                 .private = MEMFILE_PRIVATE(_M    
3033                 .read_u64 = mem_cgroup_read_u    
3034         },                                       
3035         {                                        
3036                 .name = "memsw.max_usage_in_b    
3037                 .private = MEMFILE_PRIVATE(_M    
3038                 .write = mem_cgroup_reset,       
3039                 .read_u64 = mem_cgroup_read_u    
3040         },                                       
3041         {                                        
3042                 .name = "memsw.limit_in_bytes    
3043                 .private = MEMFILE_PRIVATE(_M    
3044                 .write = mem_cgroup_write,       
3045                 .read_u64 = mem_cgroup_read_u    
3046         },                                       
3047         {                                        
3048                 .name = "memsw.failcnt",         
3049                 .private = MEMFILE_PRIVATE(_M    
3050                 .write = mem_cgroup_reset,       
3051                 .read_u64 = mem_cgroup_read_u    
3052         },                                       
3053         { },    /* terminate */                  
3054 };                                               
3055                                                  
/*
 * Adjust a page counter by nr_pages, but only when the
 * cgroup_subsys_on_dfl() test is false.
 *
 * A positive nr_pages goes through page_counter_charge(); zero or a
 * negative value takes the page_counter_uncharge() branch.
 *
 * NOTE(review): the parameter list and both call argument lists are cut off
 * in this listing, so which counter is touched and how a negative count is
 * converted for the uncharge call must be confirmed against the full file.
 */
3056 void memcg1_account_kmem(struct mem_cgroup *m    
3057 {                                                
3058         if (!cgroup_subsys_on_dfl(memory_cgrp    
3059                 if (nr_pages > 0)                
3060                         page_counter_charge(&    
3061                 else                             
3062                         page_counter_uncharge    
3063         }                                        
3064 }                                                
3065                                                  
/*
 * Try to charge pages against a counter in the memcg and track pressure.
 *
 * If page_counter_try_charge() succeeds, memcg->tcpmem_pressure is cleared
 * and true is returned.  On failure tcpmem_pressure is set to 1; if the
 * caller passed __GFP_NOFAIL in gfp_mask the charge is then applied anyway
 * with page_counter_charge() and true is returned, otherwise the function
 * returns false without charging.
 *
 * NOTE(review): the call arguments are truncated here; "fail" is presumably
 * the out-parameter of page_counter_try_charge() -- confirm in the full
 * file.
 */
3066 bool memcg1_charge_skmem(struct mem_cgroup *m    
3067                          gfp_t gfp_mask)         
3068 {                                                
3069         struct page_counter *fail;               
3070                                                  
3071         if (page_counter_try_charge(&memcg->t    
3072                 memcg->tcpmem_pressure = 0;      
3073                 return true;                     
3074         }                                        
3075         memcg->tcpmem_pressure = 1;              
3076         if (gfp_mask & __GFP_NOFAIL) {           
3077                 page_counter_charge(&memcg->t    
3078                 return true;                     
3079         }                                        
3080         return false;                            
3081 }                                                
3082                                                  
/*
 * Allocate the per-CPU storage behind memcg->events_percpu.
 *
 * Returns true when the resulting pointer is non-NULL and false otherwise.
 * The allocator call is truncated in this listing, so its type and flags
 * are not visible here.
 */
3083 bool memcg1_alloc_events(struct mem_cgroup *m    
3084 {                                                
3085         memcg->events_percpu = alloc_percpu_g    
3086                                                  
3087         return !!memcg->events_percpu;           
3088 }                                                
3089                                                  
/*
 * Release memcg->events_percpu with free_percpu() when the pointer is set;
 * does nothing when it is NULL.  The pointer itself is not reset in the
 * lines visible here.
 */
3090 void memcg1_free_events(struct mem_cgroup *me    
3091 {                                                
3092         if (memcg->events_percpu)                
3093                 free_percpu(memcg->events_per    
3094 }                                                
3095                                                  
/*
 * Init-time setup, registered through subsys_initcall().
 *
 * For every node visited by for_each_node(), allocate a zeroed per-node
 * tree structure (rtpn) with kzalloc_node(), initialise its rb_root to
 * RB_ROOT, its rb_rightmost to NULL and its spinlock, then publish it
 * through soft_limit_tree.  Always returns 0.
 *
 * NOTE(review): the pointer returned by kzalloc_node() is dereferenced by
 * the next statement with no NULL check in the visible lines -- confirm
 * that an allocation failure at this point is acceptable.
 */
3096 static int __init memcg1_init(void)              
3097 {                                                
3098         int node;                                
3099                                                  
3100         for_each_node(node) {                    
3101                 struct mem_cgroup_tree_per_no    
3102                                                  
3103                 rtpn = kzalloc_node(sizeof(*r    
3104                                                  
3105                 rtpn->rb_root = RB_ROOT;         
3106                 rtpn->rb_rightmost = NULL;       
3107                 spin_lock_init(&rtpn->lock);     
3108                 soft_limit_tree.rb_tree_per_n    
3109         }                                        
3110                                                  
3111         return 0;                                
3112 }                                                
3113 subsys_initcall(memcg1_init);                    
3114
~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.
TOMOYO Linux Cross Reference Linux/mm/memcontrol-v1.c

Diff markup

Differences between /mm/memcontrol-v1.c (Version linux-6.12-rc7) and /mm/memcontrol-v1.c (Version linux-5.17.15)

TOMOYO Linux Cross Reference
Linux/mm/memcontrol-v1.c