
TOMOYO Linux Cross Reference
Linux/mm/page_counter.c


Diff markup

Differences between /mm/page_counter.c (Version linux-6.12-rc7) and /mm/page_counter.c (Version linux-4.4.302)


+// SPDX-License-Identifier: GPL-2.0
 /*
  * Lockless hierarchical page accounting & limiting
  *
  * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
  */
 
 #include <linux/page_counter.h>
 #include <linux/atomic.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/sched.h>
 #include <linux/bug.h>
 #include <asm/page.h>
 
+static bool track_protection(struct page_counter *c)
+{
+        return c->protection_support;
+}
+
+static void propagate_protected_usage(struct page_counter *c,
+                                      unsigned long usage)
+{
+        unsigned long protected, old_protected;
+        long delta;
+
+        if (!c->parent)
+                return;
+
+        protected = min(usage, READ_ONCE(c->min));
+        old_protected = atomic_long_read(&c->min_usage);
+        if (protected != old_protected) {
+                old_protected = atomic_long_xchg(&c->min_usage, protected);
+                delta = protected - old_protected;
+                if (delta)
+                        atomic_long_add(delta, &c->parent->children_min_usage);
+        }
+
+        protected = min(usage, READ_ONCE(c->low));
+        old_protected = atomic_long_read(&c->low_usage);
+        if (protected != old_protected) {
+                old_protected = atomic_long_xchg(&c->low_usage, protected);
+                delta = protected - old_protected;
+                if (delta)
+                        atomic_long_add(delta, &c->parent->children_low_usage);
+        }
+}
+
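
For illustration, here is a minimal userspace sketch of the propagation scheme introduced above: a counter's protected usage is min(usage, min), and only the delta since the last published value is folded into the parent's children_min_usage. The struct and helper names below are simplified stand-ins for the kernel types, and the unconditional exchange is an assumed simplification of the kernel's read-then-xchg sequence.

#include <stdatomic.h>
#include <stdio.h>

struct counter {
        struct counter *parent;
        unsigned long min;              /* configured protection */
        atomic_long min_usage;          /* last published protected usage */
        atomic_long children_min_usage; /* sum over all children */
};

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

static void propagate(struct counter *c, unsigned long usage)
{
        long protected, old;

        if (!c->parent)
                return;
        protected = min_ul(usage, c->min);
        old = atomic_exchange(&c->min_usage, protected);
        if (protected != old)
                atomic_fetch_add(&c->parent->children_min_usage,
                                 protected - old);
}

int main(void)
{
        struct counter parent = { 0 };
        struct counter child = { .parent = &parent, .min = 100 };

        propagate(&child, 150); /* protected = min(150, 100) = 100 */
        propagate(&child, 40);  /* protected drops to 40: delta is -60 */
        printf("children_min_usage = %ld\n", /* prints 40 */
               atomic_load(&parent.children_min_usage));
        return 0;
}
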
 /**
  * page_counter_cancel - take pages out of the local counter
  * @counter: counter
  * @nr_pages: number of pages to cancel
  */
 void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
 {
         long new;
 
-        new = atomic_long_sub_return(nr_pages, &counter->count);
+        new = atomic_long_sub_return(nr_pages, &counter->usage);
         /* More uncharges than charges? */
-        WARN_ON_ONCE(new < 0);
+        if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
+                      new, nr_pages)) {
+                new = 0;
+                atomic_long_set(&counter->usage, new);
+        }
+        if (track_protection(counter))
+                propagate_protected_usage(counter, new);
 }
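
The newer version clamps the counter instead of letting it go negative. A small userspace model of that behavior (an assumed simplification; the kernel's WARN_ONCE becomes a plain fprintf here):

#include <stdatomic.h>
#include <stdio.h>

static atomic_long usage;

static void cancel(long nr_pages)
{
        long new = atomic_fetch_sub(&usage, nr_pages) - nr_pages;

        if (new < 0) { /* more uncharges than charges? */
                fprintf(stderr, "underflow: %ld nr_pages=%ld\n", new, nr_pages);
                new = 0;
                atomic_store(&usage, new);
        }
}

int main(void)
{
        atomic_store(&usage, 3);
        cancel(5); /* would reach -2; clamped back to 0 with a warning */
        printf("usage = %ld\n", atomic_load(&usage)); /* prints 0 */
        return 0;
}
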
 
 /**
  * page_counter_charge - hierarchically charge pages
  * @counter: counter
  * @nr_pages: number of pages to charge
  *
  * NOTE: This does not consider any configured counter limits.
  */
 void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
 {
         struct page_counter *c;
+        bool protection = track_protection(counter);
 
         for (c = counter; c; c = c->parent) {
                 long new;
 
-                new = atomic_long_add_return(nr_pages, &c->count);
+                new = atomic_long_add_return(nr_pages, &c->usage);
+                if (protection)
+                        propagate_protected_usage(c, new);
                 /*
                  * This is indeed racy, but we can live with some
                  * inaccuracy in the watermark.
+                 *
+                 * Notably, we have two watermarks to allow for both a globally
+                 * visible peak and one that can be reset at a smaller scope.
+                 *
+                 * Since we reset both watermarks when the global reset occurs,
+                 * we can guarantee that watermark >= local_watermark, so we
+                 * don't need to do both comparisons every time.
+                 *
+                 * On systems with branch predictors, the inner condition should
+                 * be almost free.
                  */
-                if (new > c->watermark)
-                        c->watermark = new;
+                if (new > READ_ONCE(c->local_watermark)) {
+                        WRITE_ONCE(c->local_watermark, new);
+                        if (new > READ_ONCE(c->watermark))
+                                WRITE_ONCE(c->watermark, new);
+                }
         }
 }
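
A userspace sketch of the charge loop and the two-watermark bookkeeping added above (a simplified model, not the kernel API): because a global reset clears both peaks, watermark >= local_watermark always holds, so the global comparison can hide behind the local one.

#include <stdatomic.h>
#include <stdio.h>

struct counter {
        struct counter *parent;
        atomic_long usage;
        long watermark;       /* globally visible peak */
        long local_watermark; /* peak resettable at a smaller scope */
};

static void charge(struct counter *counter, long nr_pages)
{
        for (struct counter *c = counter; c; c = c->parent) {
                long new = atomic_fetch_add(&c->usage, nr_pages) + nr_pages;

                if (new > c->local_watermark) {
                        c->local_watermark = new;
                        /*
                         * The invariant watermark >= local_watermark
                         * makes this inner check almost always false.
                         */
                        if (new > c->watermark)
                                c->watermark = new;
                }
        }
}

static void uncharge(struct counter *counter, long nr_pages)
{
        for (struct counter *c = counter; c; c = c->parent)
                atomic_fetch_sub(&c->usage, nr_pages);
}

static void reset_local_peak(struct counter *c)
{
        /* only the local peak resets, preserving the invariant */
        c->local_watermark = atomic_load(&c->usage);
}

int main(void)
{
        struct counter root = { 0 };
        struct counter leaf = { .parent = &root };

        charge(&leaf, 8);
        uncharge(&leaf, 5);
        reset_local_peak(&leaf);
        charge(&leaf, 2);
        printf("global peak %ld, local peak %ld\n",
               leaf.watermark, leaf.local_watermark); /* 8 and 5 */
        return 0;
}
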
 
 /**
  * page_counter_try_charge - try to hierarchically charge pages
  * @counter: counter
  * @nr_pages: number of pages to charge
  * @fail: points first counter to hit its limit, if any
  *
  * Returns %true on success, or %false and @fail if the counter or one
  * of its ancestors has hit its configured limit.
  */
 bool page_counter_try_charge(struct page_counter *counter,
                              unsigned long nr_pages,
                              struct page_counter **fail)
 {
         struct page_counter *c;
+        bool protection = track_protection(counter);
 
         for (c = counter; c; c = c->parent) {
                 long new;
                 /*
                  * Charge speculatively to avoid an expensive CAS.  If
                  * a bigger charge fails, it might falsely lock out a
                  * racing smaller charge and send it into reclaim
                  * early, but the error is limited to the difference
                  * between the two sizes, which is less than 2M/4M in
                  * case of a THP locking out a regular page charge.
                  *
                  * The atomic_long_add_return() implies a full memory
                  * barrier between incrementing the count and reading
-                 * the limit.  When racing with page_counter_limit(),
+                 * the limit.  When racing with page_counter_set_max(),
                  * we either see the new limit or the setter sees the
                  * counter has changed and retries.
                  */
-                new = atomic_long_add_return(nr_pages, &c->count);
-                if (new > c->limit) {
-                        atomic_long_sub(nr_pages, &c->count);
+                new = atomic_long_add_return(nr_pages, &c->usage);
+                if (new > c->max) {
+                        atomic_long_sub(nr_pages, &c->usage);
                         /*
                          * This is racy, but we can live with some
-                         * inaccuracy in the failcnt.
+                         * inaccuracy in the failcnt which is only used
+                         * to report stats.
                          */
-                        c->failcnt++;
+                        data_race(c->failcnt++);
                         *fail = c;
                         goto failed;
                 }
-                /*
-                 * Just like with failcnt, we can live with some
-                 * inaccuracy in the watermark.
-                 */
-                if (new > c->watermark)
-                        c->watermark = new;
+                if (protection)
+                        propagate_protected_usage(c, new);
+
+                /* see comment on page_counter_charge */
+                if (new > READ_ONCE(c->local_watermark)) {
+                        WRITE_ONCE(c->local_watermark, new);
+                        if (new > READ_ONCE(c->watermark))
+                                WRITE_ONCE(c->watermark, new);
+                }
         }
         return true;
 
 failed:
         for (c = counter; c != *fail; c = c->parent)
                 page_counter_cancel(c, nr_pages);
 
         return false;
 }
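
The unwind contract is worth spelling out: on failure, every ancestor charged before the failing counter has already been rolled back, so the caller only needs *fail (for example, as a reclaim target). A self-contained model of that pattern, with hypothetical userspace types standing in for the kernel's:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct counter {
        struct counter *parent;
        atomic_long usage;
        long max;
        unsigned long failcnt;
};

static bool try_charge(struct counter *counter, long nr_pages,
                       struct counter **fail)
{
        struct counter *c;

        for (c = counter; c; c = c->parent) {
                /* speculative add, backed out if the limit is overrun */
                long new = atomic_fetch_add(&c->usage, nr_pages) + nr_pages;

                if (new > c->max) {
                        atomic_fetch_sub(&c->usage, nr_pages);
                        c->failcnt++; /* racy by design: stats only */
                        *fail = c;
                        goto failed;
                }
        }
        return true;

failed:
        /* roll back the ancestors charged before the failing one */
        for (c = counter; c != *fail; c = c->parent)
                atomic_fetch_sub(&c->usage, nr_pages);
        return false;
}

int main(void)
{
        struct counter root  = { .max = 10 };
        struct counter child = { .parent = &root, .max = 100 };
        struct counter *fail = NULL;

        if (!try_charge(&child, 20, &fail))
                printf("limit hit at %s, child usage back to %ld\n",
                       fail == &root ? "root" : "child",
                       atomic_load(&child.usage)); /* root, 0 */
        return 0;
}
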
 
 /**
  * page_counter_uncharge - hierarchically uncharge pages
  * @counter: counter
  * @nr_pages: number of pages to uncharge
  */
 void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
 {
         struct page_counter *c;
 
         for (c = counter; c; c = c->parent)
                 page_counter_cancel(c, nr_pages);
 }
 
 /**
- * page_counter_limit - limit the number of pages allowed
+ * page_counter_set_max - set the maximum number of pages allowed
  * @counter: counter
- * @limit: limit to set
+ * @nr_pages: limit to set
  *
  * Returns 0 on success, -EBUSY if the current number of pages on the
  * counter already exceeds the specified limit.
  *
  * The caller must serialize invocations on the same counter.
  */
-int page_counter_limit(struct page_counter *counter, unsigned long limit)
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
 {
         for (;;) {
                 unsigned long old;
-                long count;
+                long usage;
 
                 /*
                  * Update the limit while making sure that it's not
                  * below the concurrently-changing counter value.
                  *
                  * The xchg implies two full memory barriers before
                  * and after, so the read-swap-read is ordered and
                  * ensures coherency with page_counter_try_charge():
                  * that function modifies the count before checking
                  * the limit, so if it sees the old limit, we see the
                  * modified counter and retry.
                  */
-                count = atomic_long_read(&counter->count);
+                usage = page_counter_read(counter);
 
-                if (count > limit)
+                if (usage > nr_pages)
                         return -EBUSY;
 
-                old = xchg(&counter->limit, limit);
+                old = xchg(&counter->max, nr_pages);
 
-                if (atomic_long_read(&counter->count) <= count)
+                if (page_counter_read(counter) <= usage)
                         return 0;
 
-                counter->limit = old;
+                counter->max = old;
                 cond_resched();
         }
 }
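
A compact model of the retry protocol described in the comment above, using C11 seq_cst atomics in place of the kernel's xchg (an illustration of the ordering argument, not the kernel implementation): the setter re-reads usage after the exchange, so either a racing charge sees the new limit, or the setter sees the changed counter and restarts.

#include <stdatomic.h>
#include <sched.h>
#include <stdio.h>

struct counter {
        atomic_long usage;
        atomic_long max;
};

static int set_max(struct counter *counter, long nr_pages)
{
        for (;;) {
                long usage = atomic_load(&counter->usage);

                if (usage > nr_pages)
                        return -1; /* -EBUSY in the kernel */

                /* seq_cst exchange: full barrier before and after */
                long old = atomic_exchange(&counter->max, nr_pages);

                /* no concurrent charge slipped past the check? done */
                if (atomic_load(&counter->usage) <= usage)
                        return 0;

                /* a racing charge got in: restore and retry */
                atomic_store(&counter->max, old);
                sched_yield(); /* stand-in for cond_resched() */
        }
}

int main(void)
{
        struct counter c = { 0 };

        atomic_store(&c.usage, 5);
        printf("shrink to 3: %d\n", set_max(&c, 3)); /* -1, busy */
        printf("raise to 8:  %d\n", set_max(&c, 8)); /* 0, ok */
        return 0;
}
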
 
+/**
+ * page_counter_set_min - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
+{
+        struct page_counter *c;
+
+        WRITE_ONCE(counter->min, nr_pages);
+
+        for (c = counter; c; c = c->parent)
+                propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
+/**
+ * page_counter_set_low - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
+{
+        struct page_counter *c;
+
+        WRITE_ONCE(counter->low, nr_pages);
+
+        for (c = counter; c; c = c->parent)
+                propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
 /**
  * page_counter_memparse - memparse() for page counter limits
  * @buf: string to parse
  * @max: string meaning maximum possible value
  * @nr_pages: returns the result in number of pages
  *
  * Returns -EINVAL, or 0 and @nr_pages on success.  @nr_pages will be
  * limited to %PAGE_COUNTER_MAX.
  */
 int page_counter_memparse(const char *buf, const char *max,
                           unsigned long *nr_pages)
 {
         char *end;
         u64 bytes;
 
         if (!strcmp(buf, max)) {
                 *nr_pages = PAGE_COUNTER_MAX;
                 return 0;
         }
 
         bytes = memparse(buf, &end);
         if (*end != '\0')
                 return -EINVAL;
 
         *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
 
         return 0;
 }
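
A userspace approximation of this parsing contract: "max" maps to PAGE_COUNTER_MAX, anything else is parsed as a byte size, converted to pages, and clamped. The suffix handling below is a simplified stand-in for the kernel's memparse() (which also accepts lowercase suffixes), and the PAGE_SIZE and PAGE_COUNTER_MAX values are assumptions.

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE        4096ULL /* assumed page size */
#define PAGE_COUNTER_MAX ((unsigned long long)LONG_MAX)

static int parse_limit(const char *buf, unsigned long long *nr_pages)
{
        unsigned long long bytes;
        char *end;

        if (!strcmp(buf, "max")) {
                *nr_pages = PAGE_COUNTER_MAX;
                return 0;
        }

        bytes = strtoull(buf, &end, 0);
        switch (*end) { /* memparse()-style binary suffixes */
        case 'G': bytes <<= 10; /* fall through */
        case 'M': bytes <<= 10; /* fall through */
        case 'K': bytes <<= 10; end++; break;
        }
        if (*end != '\0')
                return -1; /* -EINVAL in the kernel */

        bytes /= PAGE_SIZE;
        *nr_pages = bytes < PAGE_COUNTER_MAX ? bytes : PAGE_COUNTER_MAX;
        return 0;
}

int main(void)
{
        unsigned long long pages;

        if (!parse_limit("512M", &pages))
                printf("512M -> %llu pages\n", pages); /* 131072 */
        if (!parse_limit("max", &pages))
                printf("max  -> %llu pages\n", pages);
        return 0;
}
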
+
+#ifdef CONFIG_MEMCG
+/*
+ * This function calculates an individual page counter's effective
+ * protection which is derived from its own memory.min/low, its
+ * parent's and siblings' settings, as well as the actual memory
+ * distribution in the tree.
+ *
+ * The following rules apply to the effective protection values:
+ *
+ * 1. At the first level of reclaim, effective protection is equal to
+ *    the declared protection in memory.min and memory.low.
+ *
+ * 2. To enable safe delegation of the protection configuration, at
+ *    subsequent levels the effective protection is capped to the
+ *    parent's effective protection.
+ *
+ * 3. To make complex and dynamic subtrees easier to configure, the
+ *    user is allowed to overcommit the declared protection at a given
+ *    level. If that is the case, the parent's effective protection is
+ *    distributed to the children in proportion to how much protection
+ *    they have declared and how much of it they are utilizing.
+ *
+ *    This makes distribution proportional, but also work-conserving:
+ *    if one counter claims much more protection than it uses memory,
+ *    the unused remainder is available to its siblings.
+ *
+ * 4. Conversely, when the declared protection is undercommitted at a
+ *    given level, the distribution of the larger parental protection
+ *    budget is NOT proportional. A counter's protection from a parent
+ *    is capped to its own memory.min/low setting.
+ *
+ * 5. However, to allow protecting recursive subtrees from each other
+ *    without having to declare each individual counter's fixed share
+ *    of the ancestor's claim to protection, any unutilized -
+ *    "floating" - protection from up the tree is distributed in
+ *    proportion to each counter's *usage*. This makes the protection
+ *    neutral wrt sibling cgroups and lets them compete freely over
+ *    the shared parental protection budget, but it protects the
+ *    subtree as a whole from neighboring subtrees.
+ *
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
+ * against immediate siblings whereas 5. is about protecting against
+ * neighboring subtrees.
+ */
+static unsigned long effective_protection(unsigned long usage,
+                                          unsigned long parent_usage,
+                                          unsigned long setting,
+                                          unsigned long parent_effective,
+                                          unsigned long siblings_protected,
+                                          bool recursive_protection)
+{
+        unsigned long protected;
+        unsigned long ep;
+
+        protected = min(usage, setting);
+        /*
+         * If all cgroups at this level combined claim and use more
+         * protection than what the parent affords them, distribute
+         * shares in proportion to utilization.
+         *
+         * We are using actual utilization rather than the statically
+         * claimed protection in order to be work-conserving: claimed
+         * but unused protection is available to siblings that would
+         * otherwise get a smaller chunk than what they claimed.
+         */
+        if (siblings_protected > parent_effective)
+                return protected * parent_effective / siblings_protected;
+
+        /*
+         * Ok, utilized protection of all children is within what the
+         * parent affords them, so we know whatever this child claims
+         * and utilizes is effectively protected.
+         *
+         * If there is unprotected usage beyond this value, reclaim
+         * will apply pressure in proportion to that amount.
+         *
+         * If there is unutilized protection, the cgroup will be fully
+         * shielded from reclaim, but we do return a smaller value for
+         * protection than what the group could enjoy in theory. This
+         * is okay. With the overcommit distribution above, effective
+         * protection is always dependent on how memory is actually
+         * consumed among the siblings anyway.
+         */
+        ep = protected;
+
+        /*
+         * If the children aren't claiming (all of) the protection
+         * afforded to them by the parent, distribute the remainder in
+         * proportion to the (unprotected) memory of each cgroup. That
+         * way, cgroups that aren't explicitly prioritized wrt each
+         * other compete freely over the allowance, but they are
+         * collectively protected from neighboring trees.
+         *
+         * We're using unprotected memory for the weight so that if
+         * some cgroups DO claim explicit protection, we don't protect
+         * the same bytes twice.
+         *
+         * Check both usage and parent_usage against the respective
+         * protected values. One should imply the other, but they
+         * aren't read atomically - make sure the division is sane.
+         */
+        if (!recursive_protection)
+                return ep;
+
+        if (parent_effective > siblings_protected &&
+            parent_usage > siblings_protected &&
+            usage > protected) {
+                unsigned long unclaimed;
+
+                unclaimed = parent_effective - siblings_protected;
+                unclaimed *= usage - protected;
+                unclaimed /= parent_usage - siblings_protected;
+
+                ep += unclaimed;
+        }
+
+        return ep;
+}
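
The rules above are easiest to check with concrete numbers. The standalone model below reproduces the arithmetic of effective_protection() without the kernel types, for two cases: overcommitted siblings (rule 3) and a floating share under recursive protection (rule 5).

#include <stdbool.h>
#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* same arithmetic as effective_protection(), minus the atomics */
static unsigned long effective(unsigned long usage, unsigned long parent_usage,
                               unsigned long setting,
                               unsigned long parent_effective,
                               unsigned long siblings_protected,
                               bool recursive)
{
        unsigned long protected = min_ul(usage, setting);
        unsigned long ep;

        if (siblings_protected > parent_effective)
                return protected * parent_effective / siblings_protected;

        ep = protected;
        if (recursive && parent_effective > siblings_protected &&
            parent_usage > siblings_protected && usage > protected)
                ep += (parent_effective - siblings_protected) *
                      (usage - protected) /
                      (parent_usage - siblings_protected);
        return ep;
}

int main(void)
{
        /*
         * Overcommit: the parent affords 100 pages, children A and B
         * each claim and use 80, so siblings_protected = 160 > 100 and
         * each child gets 80 * 100 / 160 = 50 effective pages.
         */
        printf("overcommit: %lu\n", effective(80, 160, 80, 100, 160, false));

        /*
         * Floating share: the parent affords 100 but children claim
         * only 40 in total; a child using 70 with a 40-page claim gets
         * 40 + (100-40) * (70-40) / (120-40) = 62 effective pages.
         */
        printf("floating:   %lu\n", effective(70, 120, 40, 100, 40, true));
        return 0;
}
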
+
+
+/**
+ * page_counter_calculate_protection - check if memory consumption is in the normal range
+ * @root: the top ancestor of the sub-tree being checked
+ * @counter: the page_counter the counter to update
+ * @recursive_protection: Whether to use memory_recursiveprot behavior.
+ *
+ * Calculates elow/emin thresholds for given page_counter.
+ *
+ * WARNING: This function is not stateless! It can only be used as part
+ *          of a top-down tree iteration, not for isolated queries.
+ */
+void page_counter_calculate_protection(struct page_counter *root,
+                                       struct page_counter *counter,
+                                       bool recursive_protection)
+{
+        unsigned long usage, parent_usage;
+        struct page_counter *parent = counter->parent;
+
+        /*
+         * Effective values of the reclaim targets are ignored so they
+         * can be stale. Have a look at mem_cgroup_protection for more
+         * details.
+         * TODO: calculation should be more robust so that we do not need
+         * that special casing.
+         */
+        if (root == counter)
+                return;
+
+        usage = page_counter_read(counter);
+        if (!usage)
+                return;
+
+        if (parent == root) {
+                counter->emin = READ_ONCE(counter->min);
+                counter->elow = READ_ONCE(counter->low);
+                return;
+        }
+
+        parent_usage = page_counter_read(parent);
+
+        WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage,
+                        READ_ONCE(counter->min),
+                        READ_ONCE(parent->emin),
+                        atomic_long_read(&parent->children_min_usage),
+                        recursive_protection));
+
+        WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage,
+                        READ_ONCE(counter->low),
+                        READ_ONCE(parent->elow),
+                        atomic_long_read(&parent->children_low_usage),
+                        recursive_protection));
+}
+#endif /* CONFIG_MEMCG */
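
A minimal model of the top-down constraint in the WARNING above: a node's effective protection is derived from its parent's already-computed value (rule 2), so parents must be visited before children. This toy version implements only the capping step, standing in for the full effective_protection() math.

#include <stdio.h>

struct node {
        struct node *parent;
        unsigned long min;  /* declared protection */
        unsigned long emin; /* computed effective protection */
};

static void calculate(struct node *root, struct node *n)
{
        if (n == root)
                return;
        if (n->parent == root) {
                n->emin = n->min; /* first level: as declared */
                return;
        }
        /* deeper levels: capped to the parent's effective value */
        n->emin = n->min < n->parent->emin ? n->min : n->parent->emin;
}

int main(void)
{
        struct node root = { 0 };
        struct node mid  = { .parent = &root, .min = 50 };
        struct node leaf = { .parent = &mid,  .min = 80 };

        /* parents before children, as in a pre-order hierarchy walk */
        calculate(&root, &mid);
        calculate(&root, &leaf);
        printf("mid.emin=%lu leaf.emin=%lu\n",
               mid.emin, leaf.emin); /* 50 and 50 */
        return 0;
}
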
