
TOMOYO Linux Cross Reference
Linux/mm/vmstat.c


Diff markup

Differences between /mm/vmstat.c (Version linux-6.12-rc7) and /mm/vmstat.c (Version linux-5.15.171)
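Two recurring changes stand out in the listing below. First, the open-coded "if (IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable()" / "preempt_enable()" pairs that guarded the per-cpu counter updates in 5.15 are replaced by preempt_disable_nested() / preempt_enable_nested() in 6.12, which carry the same PREEMPT_RT-only behaviour behind a helper. Second, the compare-exchange loops in mod_zone_state() and mod_node_state() switch from this_cpu_cmpxchg(), which re-reads the old per-cpu value on every iteration, to this_cpu_try_cmpxchg(), which reads it once and lets a failed exchange refresh it. As a rough illustration of that second pattern only, here is a minimal userspace sketch using GCC/Clang atomic builtins; it is not kernel code, and the "counter" variable is a hypothetical stand-in for the per-cpu diff:

    #include <stdio.h>

    static long counter;   /* hypothetical stand-in for the per-cpu stat diff */

    /* Old style (as in the 5.15 column): re-read the old value at the top of
     * every iteration and retry until the compare-and-swap succeeds. */
    static void add_cmpxchg_style(long delta)
    {
            long old, new;

            do {
                    old = __atomic_load_n(&counter, __ATOMIC_RELAXED);
                    new = old + delta;
            } while (!__atomic_compare_exchange_n(&counter, &old, new, 0,
                                                  __ATOMIC_RELAXED, __ATOMIC_RELAXED));
    }

    /* New style (as in the 6.12 column, via this_cpu_try_cmpxchg): read the
     * old value once; a failed compare-and-swap writes the current value back
     * into 'old', so the loop retries without an extra load. */
    static void add_try_cmpxchg_style(long delta)
    {
            long old = __atomic_load_n(&counter, __ATOMIC_RELAXED);
            long new;

            do {
                    new = old + delta;
            } while (!__atomic_compare_exchange_n(&counter, &old, new, 0,
                                                  __ATOMIC_RELAXED, __ATOMIC_RELAXED));
    }

    int main(void)
    {
            add_cmpxchg_style(3);
            add_try_cmpxchg_style(-1);
            printf("%ld\n", counter);   /* prints 2 */
            return 0;
    }

Both loops compute a new value from the observed old one and publish it atomically; the try-variant simply avoids the redundant reload on contention, which is the shape the 6.12 code adopts.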


  1 // SPDX-License-Identifier: GPL-2.0-only            1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*                                                  2 /*
  3  *  linux/mm/vmstat.c                               3  *  linux/mm/vmstat.c
  4  *                                                  4  *
  5  *  Manages VM statistics                           5  *  Manages VM statistics
  6  *  Copyright (C) 1991, 1992, 1993, 1994  Linu      6  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
  7  *                                                  7  *
  8  *  zoned VM statistics                             8  *  zoned VM statistics
  9  *  Copyright (C) 2006 Silicon Graphics, Inc.,      9  *  Copyright (C) 2006 Silicon Graphics, Inc.,
 10  *              Christoph Lameter <christoph@l     10  *              Christoph Lameter <christoph@lameter.com>
 11  *  Copyright (C) 2008-2014 Christoph Lameter      11  *  Copyright (C) 2008-2014 Christoph Lameter
 12  */                                                12  */
 13 #include <linux/fs.h>                              13 #include <linux/fs.h>
 14 #include <linux/mm.h>                              14 #include <linux/mm.h>
 15 #include <linux/err.h>                             15 #include <linux/err.h>
 16 #include <linux/module.h>                          16 #include <linux/module.h>
 17 #include <linux/slab.h>                            17 #include <linux/slab.h>
 18 #include <linux/cpu.h>                             18 #include <linux/cpu.h>
 19 #include <linux/cpumask.h>                         19 #include <linux/cpumask.h>
 20 #include <linux/vmstat.h>                          20 #include <linux/vmstat.h>
 21 #include <linux/proc_fs.h>                         21 #include <linux/proc_fs.h>
 22 #include <linux/seq_file.h>                        22 #include <linux/seq_file.h>
 23 #include <linux/debugfs.h>                         23 #include <linux/debugfs.h>
 24 #include <linux/sched.h>                           24 #include <linux/sched.h>
 25 #include <linux/math64.h>                          25 #include <linux/math64.h>
 26 #include <linux/writeback.h>                       26 #include <linux/writeback.h>
 27 #include <linux/compaction.h>                      27 #include <linux/compaction.h>
 28 #include <linux/mm_inline.h>                       28 #include <linux/mm_inline.h>
                                                   >>  29 #include <linux/page_ext.h>
 29 #include <linux/page_owner.h>                      30 #include <linux/page_owner.h>
 30 #include <linux/sched/isolation.h>             << 
 31                                                    31 
 32 #include "internal.h"                              32 #include "internal.h"
 33                                                    33 
 34 #ifdef CONFIG_NUMA                                 34 #ifdef CONFIG_NUMA
 35 int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;        35 int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
 36                                                    36 
 37 /* zero numa counters within a zone */             37 /* zero numa counters within a zone */
 38 static void zero_zone_numa_counters(struct zon     38 static void zero_zone_numa_counters(struct zone *zone)
 39 {                                                  39 {
 40         int item, cpu;                             40         int item, cpu;
 41                                                    41 
 42         for (item = 0; item < NR_VM_NUMA_EVENT     42         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
 43                 atomic_long_set(&zone->vm_numa     43                 atomic_long_set(&zone->vm_numa_event[item], 0);
 44                 for_each_online_cpu(cpu) {         44                 for_each_online_cpu(cpu) {
 45                         per_cpu_ptr(zone->per_     45                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
 46                                                    46                                                 = 0;
 47                 }                                  47                 }
 48         }                                          48         }
 49 }                                                  49 }
 50                                                    50 
 51 /* zero numa counters of all the populated zon     51 /* zero numa counters of all the populated zones */
 52 static void zero_zones_numa_counters(void)         52 static void zero_zones_numa_counters(void)
 53 {                                                  53 {
 54         struct zone *zone;                         54         struct zone *zone;
 55                                                    55 
 56         for_each_populated_zone(zone)              56         for_each_populated_zone(zone)
 57                 zero_zone_numa_counters(zone);     57                 zero_zone_numa_counters(zone);
 58 }                                                  58 }
 59                                                    59 
 60 /* zero global numa counters */                    60 /* zero global numa counters */
 61 static void zero_global_numa_counters(void)        61 static void zero_global_numa_counters(void)
 62 {                                                  62 {
 63         int item;                                  63         int item;
 64                                                    64 
 65         for (item = 0; item < NR_VM_NUMA_EVENT     65         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
 66                 atomic_long_set(&vm_numa_event     66                 atomic_long_set(&vm_numa_event[item], 0);
 67 }                                                  67 }
 68                                                    68 
 69 static void invalid_numa_statistics(void)          69 static void invalid_numa_statistics(void)
 70 {                                                  70 {
 71         zero_zones_numa_counters();                71         zero_zones_numa_counters();
 72         zero_global_numa_counters();               72         zero_global_numa_counters();
 73 }                                                  73 }
 74                                                    74 
 75 static DEFINE_MUTEX(vm_numa_stat_lock);            75 static DEFINE_MUTEX(vm_numa_stat_lock);
 76                                                    76 
 77 int sysctl_vm_numa_stat_handler(const struct c !!  77 int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
 78                 void *buffer, size_t *length,      78                 void *buffer, size_t *length, loff_t *ppos)
 79 {                                                  79 {
 80         int ret, oldval;                           80         int ret, oldval;
 81                                                    81 
 82         mutex_lock(&vm_numa_stat_lock);            82         mutex_lock(&vm_numa_stat_lock);
 83         if (write)                                 83         if (write)
 84                 oldval = sysctl_vm_numa_stat;      84                 oldval = sysctl_vm_numa_stat;
 85         ret = proc_dointvec_minmax(table, writ     85         ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
 86         if (ret || !write)                         86         if (ret || !write)
 87                 goto out;                          87                 goto out;
 88                                                    88 
 89         if (oldval == sysctl_vm_numa_stat)         89         if (oldval == sysctl_vm_numa_stat)
 90                 goto out;                          90                 goto out;
 91         else if (sysctl_vm_numa_stat == ENABLE     91         else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
 92                 static_branch_enable(&vm_numa_     92                 static_branch_enable(&vm_numa_stat_key);
 93                 pr_info("enable numa statistic     93                 pr_info("enable numa statistics\n");
 94         } else {                                   94         } else {
 95                 static_branch_disable(&vm_numa     95                 static_branch_disable(&vm_numa_stat_key);
 96                 invalid_numa_statistics();         96                 invalid_numa_statistics();
 97                 pr_info("disable numa statisti     97                 pr_info("disable numa statistics, and clear numa counters\n");
 98         }                                          98         }
 99                                                    99 
100 out:                                              100 out:
101         mutex_unlock(&vm_numa_stat_lock);         101         mutex_unlock(&vm_numa_stat_lock);
102         return ret;                               102         return ret;
103 }                                                 103 }
104 #endif                                            104 #endif
105                                                   105 
106 #ifdef CONFIG_VM_EVENT_COUNTERS                   106 #ifdef CONFIG_VM_EVENT_COUNTERS
107 DEFINE_PER_CPU(struct vm_event_state, vm_event    107 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
108 EXPORT_PER_CPU_SYMBOL(vm_event_states);           108 EXPORT_PER_CPU_SYMBOL(vm_event_states);
109                                                   109 
110 static void sum_vm_events(unsigned long *ret)     110 static void sum_vm_events(unsigned long *ret)
111 {                                                 111 {
112         int cpu;                                  112         int cpu;
113         int i;                                    113         int i;
114                                                   114 
115         memset(ret, 0, NR_VM_EVENT_ITEMS * siz    115         memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
116                                                   116 
117         for_each_online_cpu(cpu) {                117         for_each_online_cpu(cpu) {
118                 struct vm_event_state *this =     118                 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
119                                                   119 
120                 for (i = 0; i < NR_VM_EVENT_IT    120                 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
121                         ret[i] += this->event[    121                         ret[i] += this->event[i];
122         }                                         122         }
123 }                                                 123 }
124                                                   124 
125 /*                                                125 /*
126  * Accumulate the vm event counters across all    126  * Accumulate the vm event counters across all CPUs.
127  * The result is unavoidably approximate - it     127  * The result is unavoidably approximate - it can change
128  * during and after execution of this function    128  * during and after execution of this function.
129 */                                                129 */
130 void all_vm_events(unsigned long *ret)            130 void all_vm_events(unsigned long *ret)
131 {                                                 131 {
132         cpus_read_lock();                         132         cpus_read_lock();
133         sum_vm_events(ret);                       133         sum_vm_events(ret);
134         cpus_read_unlock();                       134         cpus_read_unlock();
135 }                                                 135 }
136 EXPORT_SYMBOL_GPL(all_vm_events);                 136 EXPORT_SYMBOL_GPL(all_vm_events);
137                                                   137 
138 /*                                                138 /*
139  * Fold the foreign cpu events into our own.      139  * Fold the foreign cpu events into our own.
140  *                                                140  *
141  * This is adding to the events on one process    141  * This is adding to the events on one processor
142  * but keeps the global counts constant.          142  * but keeps the global counts constant.
143  */                                               143  */
144 void vm_events_fold_cpu(int cpu)                  144 void vm_events_fold_cpu(int cpu)
145 {                                                 145 {
146         struct vm_event_state *fold_state = &p    146         struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
147         int i;                                    147         int i;
148                                                   148 
149         for (i = 0; i < NR_VM_EVENT_ITEMS; i++    149         for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
150                 count_vm_events(i, fold_state-    150                 count_vm_events(i, fold_state->event[i]);
151                 fold_state->event[i] = 0;         151                 fold_state->event[i] = 0;
152         }                                         152         }
153 }                                                 153 }
154                                                   154 
155 #endif /* CONFIG_VM_EVENT_COUNTERS */             155 #endif /* CONFIG_VM_EVENT_COUNTERS */
156                                                   156 
157 /*                                                157 /*
158  * Manage combined zone based / global counter    158  * Manage combined zone based / global counters
159  *                                                159  *
160  * vm_stat contains the global counters           160  * vm_stat contains the global counters
161  */                                               161  */
162 atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITE    162 atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
163 atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITE    163 atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
164 atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_I    164 atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
165 EXPORT_SYMBOL(vm_zone_stat);                      165 EXPORT_SYMBOL(vm_zone_stat);
166 EXPORT_SYMBOL(vm_node_stat);                      166 EXPORT_SYMBOL(vm_node_stat);
167                                                   167 
168 #ifdef CONFIG_NUMA                             << 
169 static void fold_vm_zone_numa_events(struct zo << 
170 {                                              << 
171         unsigned long zone_numa_events[NR_VM_N << 
172         int cpu;                               << 
173         enum numa_stat_item item;              << 
174                                                << 
175         for_each_online_cpu(cpu) {             << 
176                 struct per_cpu_zonestat *pzsta << 
177                                                << 
178                 pzstats = per_cpu_ptr(zone->pe << 
179                 for (item = 0; item < NR_VM_NU << 
180                         zone_numa_events[item] << 
181         }                                      << 
182                                                << 
183         for (item = 0; item < NR_VM_NUMA_EVENT << 
184                 zone_numa_event_add(zone_numa_ << 
185 }                                              << 
186                                                << 
187 void fold_vm_numa_events(void)                 << 
188 {                                              << 
189         struct zone *zone;                     << 
190                                                << 
191         for_each_populated_zone(zone)          << 
192                 fold_vm_zone_numa_events(zone) << 
193 }                                              << 
194 #endif                                         << 
195                                                << 
196 #ifdef CONFIG_SMP                                 168 #ifdef CONFIG_SMP
197                                                   169 
198 int calculate_pressure_threshold(struct zone *    170 int calculate_pressure_threshold(struct zone *zone)
199 {                                                 171 {
200         int threshold;                            172         int threshold;
201         int watermark_distance;                   173         int watermark_distance;
202                                                   174 
203         /*                                        175         /*
204          * As vmstats are not up to date, ther    176          * As vmstats are not up to date, there is drift between the estimated
205          * and real values. For high threshold    177          * and real values. For high thresholds and a high number of CPUs, it
206          * is possible for the min watermark t    178          * is possible for the min watermark to be breached while the estimated
207          * value looks fine. The pressure thre    179          * value looks fine. The pressure threshold is a reduced value such
208          * that even the maximum amount of dri    180          * that even the maximum amount of drift will not accidentally breach
209          * the min watermark                      181          * the min watermark
210          */                                       182          */
211         watermark_distance = low_wmark_pages(z    183         watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
212         threshold = max(1, (int)(watermark_dis    184         threshold = max(1, (int)(watermark_distance / num_online_cpus()));
213                                                   185 
214         /*                                        186         /*
215          * Maximum threshold is 125               187          * Maximum threshold is 125
216          */                                       188          */
217         threshold = min(125, threshold);          189         threshold = min(125, threshold);
218                                                   190 
219         return threshold;                         191         return threshold;
220 }                                                 192 }
221                                                   193 
222 int calculate_normal_threshold(struct zone *zo    194 int calculate_normal_threshold(struct zone *zone)
223 {                                                 195 {
224         int threshold;                            196         int threshold;
225         int mem;        /* memory in 128 MB un    197         int mem;        /* memory in 128 MB units */
226                                                   198 
227         /*                                        199         /*
228          * The threshold scales with the numbe    200          * The threshold scales with the number of processors and the amount
229          * of memory per zone. More memory mea    201          * of memory per zone. More memory means that we can defer updates for
230          * longer, more processors could lead     202          * longer, more processors could lead to more contention.
231          * fls() is used to have a cheap way o    203          * fls() is used to have a cheap way of logarithmic scaling.
232          *                                        204          *
233          * Some sample thresholds:                205          * Some sample thresholds:
234          *                                        206          *
235          * Threshold    Processors      (fls)     207          * Threshold    Processors      (fls)   Zonesize        fls(mem)+1
236          * -----------------------------------    208          * ------------------------------------------------------------------
237          * 8            1               1         209          * 8            1               1       0.9-1 GB        4
238          * 16           2               2         210          * 16           2               2       0.9-1 GB        4
239          * 20           2               2         211          * 20           2               2       1-2 GB          5
240          * 24           2               2         212          * 24           2               2       2-4 GB          6
241          * 28           2               2         213          * 28           2               2       4-8 GB          7
242          * 32           2               2         214          * 32           2               2       8-16 GB         8
243          * 4            2               2         215          * 4            2               2       <128M           1
244          * 30           4               3         216          * 30           4               3       2-4 GB          5
245          * 48           4               3         217          * 48           4               3       8-16 GB         8
246          * 32           8               4         218          * 32           8               4       1-2 GB          4
247          * 32           8               4         219          * 32           8               4       0.9-1GB         4
248          * 10           16              5         220          * 10           16              5       <128M           1
249          * 40           16              5         221          * 40           16              5       900M            4
250          * 70           64              7         222          * 70           64              7       2-4 GB          5
251          * 84           64              7         223          * 84           64              7       4-8 GB          6
252          * 108          512             9         224          * 108          512             9       4-8 GB          6
253          * 125          1024            10        225          * 125          1024            10      8-16 GB         8
254          * 125          1024            10        226          * 125          1024            10      16-32 GB        9
255          */                                       227          */
256                                                   228 
257         mem = zone_managed_pages(zone) >> (27     229         mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
258                                                   230 
259         threshold = 2 * fls(num_online_cpus())    231         threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
260                                                   232 
261         /*                                        233         /*
262          * Maximum threshold is 125               234          * Maximum threshold is 125
263          */                                       235          */
264         threshold = min(125, threshold);          236         threshold = min(125, threshold);
265                                                   237 
266         return threshold;                         238         return threshold;
267 }                                                 239 }
268                                                   240 
269 /*                                                241 /*
270  * Refresh the thresholds for each zone.          242  * Refresh the thresholds for each zone.
271  */                                               243  */
272 void refresh_zone_stat_thresholds(void)           244 void refresh_zone_stat_thresholds(void)
273 {                                                 245 {
274         struct pglist_data *pgdat;                246         struct pglist_data *pgdat;
275         struct zone *zone;                        247         struct zone *zone;
276         int cpu;                                  248         int cpu;
277         int threshold;                            249         int threshold;
278                                                   250 
279         /* Zero current pgdat thresholds */       251         /* Zero current pgdat thresholds */
280         for_each_online_pgdat(pgdat) {            252         for_each_online_pgdat(pgdat) {
281                 for_each_online_cpu(cpu) {        253                 for_each_online_cpu(cpu) {
282                         per_cpu_ptr(pgdat->per    254                         per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
283                 }                                 255                 }
284         }                                         256         }
285                                                   257 
286         for_each_populated_zone(zone) {           258         for_each_populated_zone(zone) {
287                 struct pglist_data *pgdat = zo    259                 struct pglist_data *pgdat = zone->zone_pgdat;
288                 unsigned long max_drift, toler    260                 unsigned long max_drift, tolerate_drift;
289                                                   261 
290                 threshold = calculate_normal_t    262                 threshold = calculate_normal_threshold(zone);
291                                                   263 
292                 for_each_online_cpu(cpu) {        264                 for_each_online_cpu(cpu) {
293                         int pgdat_threshold;      265                         int pgdat_threshold;
294                                                   266 
295                         per_cpu_ptr(zone->per_    267                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
296                                                   268                                                         = threshold;
297                                                   269 
298                         /* Base nodestat thres    270                         /* Base nodestat threshold on the largest populated zone. */
299                         pgdat_threshold = per_    271                         pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
300                         per_cpu_ptr(pgdat->per    272                         per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
301                                 = max(threshol    273                                 = max(threshold, pgdat_threshold);
302                 }                                 274                 }
303                                                   275 
304                 /*                                276                 /*
305                  * Only set percpu_drift_mark     277                  * Only set percpu_drift_mark if there is a danger that
306                  * NR_FREE_PAGES reports the l    278                  * NR_FREE_PAGES reports the low watermark is ok when in fact
307                  * the min watermark could be     279                  * the min watermark could be breached by an allocation
308                  */                               280                  */
309                 tolerate_drift = low_wmark_pag    281                 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
310                 max_drift = num_online_cpus()     282                 max_drift = num_online_cpus() * threshold;
311                 if (max_drift > tolerate_drift    283                 if (max_drift > tolerate_drift)
312                         zone->percpu_drift_mar    284                         zone->percpu_drift_mark = high_wmark_pages(zone) +
313                                         max_dr    285                                         max_drift;
314         }                                         286         }
315 }                                                 287 }
316                                                   288 
317 void set_pgdat_percpu_threshold(pg_data_t *pgd    289 void set_pgdat_percpu_threshold(pg_data_t *pgdat,
318                                 int (*calculat    290                                 int (*calculate_pressure)(struct zone *))
319 {                                                 291 {
320         struct zone *zone;                        292         struct zone *zone;
321         int cpu;                                  293         int cpu;
322         int threshold;                            294         int threshold;
323         int i;                                    295         int i;
324                                                   296 
325         for (i = 0; i < pgdat->nr_zones; i++)     297         for (i = 0; i < pgdat->nr_zones; i++) {
326                 zone = &pgdat->node_zones[i];     298                 zone = &pgdat->node_zones[i];
327                 if (!zone->percpu_drift_mark)     299                 if (!zone->percpu_drift_mark)
328                         continue;                 300                         continue;
329                                                   301 
330                 threshold = (*calculate_pressu    302                 threshold = (*calculate_pressure)(zone);
331                 for_each_online_cpu(cpu)          303                 for_each_online_cpu(cpu)
332                         per_cpu_ptr(zone->per_    304                         per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
333                                                   305                                                         = threshold;
334         }                                         306         }
335 }                                                 307 }
336                                                   308 
337 /*                                                309 /*
338  * For use when we know that interrupts are di    310  * For use when we know that interrupts are disabled,
339  * or when we know that preemption is disabled    311  * or when we know that preemption is disabled and that
340  * particular counter cannot be updated from i    312  * particular counter cannot be updated from interrupt context.
341  */                                               313  */
342 void __mod_zone_page_state(struct zone *zone,     314 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
343                            long delta)            315                            long delta)
344 {                                                 316 {
345         struct per_cpu_zonestat __percpu *pcp     317         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
346         s8 __percpu *p = pcp->vm_stat_diff + i    318         s8 __percpu *p = pcp->vm_stat_diff + item;
347         long x;                                   319         long x;
348         long t;                                   320         long t;
349                                                   321 
350         /*                                        322         /*
351          * Accurate vmstat updates require a R    323          * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
352          * atomicity is provided by IRQs being    324          * atomicity is provided by IRQs being disabled -- either explicitly
353          * or via local_lock_irq. On PREEMPT_R    325          * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
354          * CPU migrations and preemption poten    326          * CPU migrations and preemption potentially corrupts a counter so
355          * disable preemption.                    327          * disable preemption.
356          */                                       328          */
357         preempt_disable_nested();              !! 329         if (IS_ENABLED(CONFIG_PREEMPT_RT))
                                                   >> 330                 preempt_disable();
358                                                   331 
359         x = delta + __this_cpu_read(*p);          332         x = delta + __this_cpu_read(*p);
360                                                   333 
361         t = __this_cpu_read(pcp->stat_threshol    334         t = __this_cpu_read(pcp->stat_threshold);
362                                                   335 
363         if (unlikely(abs(x) > t)) {               336         if (unlikely(abs(x) > t)) {
364                 zone_page_state_add(x, zone, i    337                 zone_page_state_add(x, zone, item);
365                 x = 0;                            338                 x = 0;
366         }                                         339         }
367         __this_cpu_write(*p, x);                  340         __this_cpu_write(*p, x);
368                                                   341 
369         preempt_enable_nested();               !! 342         if (IS_ENABLED(CONFIG_PREEMPT_RT))
                                                   >> 343                 preempt_enable();
370 }                                                 344 }
371 EXPORT_SYMBOL(__mod_zone_page_state);             345 EXPORT_SYMBOL(__mod_zone_page_state);
372                                                   346 
373 void __mod_node_page_state(struct pglist_data     347 void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
374                                 long delta)       348                                 long delta)
375 {                                                 349 {
376         struct per_cpu_nodestat __percpu *pcp     350         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
377         s8 __percpu *p = pcp->vm_node_stat_dif    351         s8 __percpu *p = pcp->vm_node_stat_diff + item;
378         long x;                                   352         long x;
379         long t;                                   353         long t;
380                                                   354 
381         if (vmstat_item_in_bytes(item)) {         355         if (vmstat_item_in_bytes(item)) {
382                 /*                                356                 /*
383                  * Only cgroups use subpage ac    357                  * Only cgroups use subpage accounting right now; at
384                  * the global level, these ite    358                  * the global level, these items still change in
385                  * multiples of whole pages. S    359                  * multiples of whole pages. Store them as pages
386                  * internally to keep the per-    360                  * internally to keep the per-cpu counters compact.
387                  */                               361                  */
388                 VM_WARN_ON_ONCE(delta & (PAGE_    362                 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
389                 delta >>= PAGE_SHIFT;             363                 delta >>= PAGE_SHIFT;
390         }                                         364         }
391                                                   365 
392         /* See __mod_node_page_state */           366         /* See __mod_node_page_state */
393         preempt_disable_nested();              !! 367         if (IS_ENABLED(CONFIG_PREEMPT_RT))
                                                   >> 368                 preempt_disable();
394                                                   369 
395         x = delta + __this_cpu_read(*p);          370         x = delta + __this_cpu_read(*p);
396                                                   371 
397         t = __this_cpu_read(pcp->stat_threshol    372         t = __this_cpu_read(pcp->stat_threshold);
398                                                   373 
399         if (unlikely(abs(x) > t)) {               374         if (unlikely(abs(x) > t)) {
400                 node_page_state_add(x, pgdat,     375                 node_page_state_add(x, pgdat, item);
401                 x = 0;                            376                 x = 0;
402         }                                         377         }
403         __this_cpu_write(*p, x);                  378         __this_cpu_write(*p, x);
404                                                   379 
405         preempt_enable_nested();               !! 380         if (IS_ENABLED(CONFIG_PREEMPT_RT))
                                                   >> 381                 preempt_enable();
406 }                                                 382 }
407 EXPORT_SYMBOL(__mod_node_page_state);             383 EXPORT_SYMBOL(__mod_node_page_state);
408                                                   384 
409 /*                                                385 /*
410  * Optimized increment and decrement functions    386  * Optimized increment and decrement functions.
411  *                                                387  *
412  * These are only for a single page and theref    388  * These are only for a single page and therefore can take a struct page *
413  * argument instead of struct zone *. This all    389  * argument instead of struct zone *. This allows the inclusion of the code
414  * generated for page_zone(page) into the opti    390  * generated for page_zone(page) into the optimized functions.
415  *                                                391  *
416  * No overflow check is necessary and therefor    392  * No overflow check is necessary and therefore the differential can be
417  * incremented or decremented in place which m    393  * incremented or decremented in place which may allow the compilers to
418  * generate better code.                          394  * generate better code.
419  * The increment or decrement is known and the    395  * The increment or decrement is known and therefore one boundary check can
420  * be omitted.                                    396  * be omitted.
421  *                                                397  *
422  * NOTE: These functions are very performance     398  * NOTE: These functions are very performance sensitive. Change only
423  * with care.                                     399  * with care.
424  *                                                400  *
425  * Some processors have inc/dec instructions t    401  * Some processors have inc/dec instructions that are atomic vs an interrupt.
426  * However, the code must first determine the     402  * However, the code must first determine the differential location in a zone
427  * based on the processor number and then inc/    403  * based on the processor number and then inc/dec the counter. There is no
428  * guarantee without disabling preemption that    404  * guarantee without disabling preemption that the processor will not change
429  * in between and therefore the atomicity vs.     405  * in between and therefore the atomicity vs. interrupt cannot be exploited
430  * in a useful way here.                          406  * in a useful way here.
431  */                                               407  */
432 void __inc_zone_state(struct zone *zone, enum     408 void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
433 {                                                 409 {
434         struct per_cpu_zonestat __percpu *pcp     410         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
435         s8 __percpu *p = pcp->vm_stat_diff + i    411         s8 __percpu *p = pcp->vm_stat_diff + item;
436         s8 v, t;                                  412         s8 v, t;
437                                                   413 
438         /* See __mod_node_page_state */           414         /* See __mod_node_page_state */
439         preempt_disable_nested();              !! 415         if (IS_ENABLED(CONFIG_PREEMPT_RT))
                                                   >> 416                 preempt_disable();
440                                                   417 
441         v = __this_cpu_inc_return(*p);            418         v = __this_cpu_inc_return(*p);
442         t = __this_cpu_read(pcp->stat_threshol    419         t = __this_cpu_read(pcp->stat_threshold);
443         if (unlikely(v > t)) {                    420         if (unlikely(v > t)) {
444                 s8 overstep = t >> 1;             421                 s8 overstep = t >> 1;
445                                                   422 
446                 zone_page_state_add(v + overst    423                 zone_page_state_add(v + overstep, zone, item);
447                 __this_cpu_write(*p, -overstep    424                 __this_cpu_write(*p, -overstep);
448         }                                         425         }
449                                                   426 
450         preempt_enable_nested();               !! 427         if (IS_ENABLED(CONFIG_PREEMPT_RT))
                                                   >> 428                 preempt_enable();
451 }                                                 429 }
452                                                   430 
453 void __inc_node_state(struct pglist_data *pgda    431 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
454 {                                                 432 {
455         struct per_cpu_nodestat __percpu *pcp     433         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
456         s8 __percpu *p = pcp->vm_node_stat_dif    434         s8 __percpu *p = pcp->vm_node_stat_diff + item;
457         s8 v, t;                                  435         s8 v, t;
458                                                   436 
459         VM_WARN_ON_ONCE(vmstat_item_in_bytes(i    437         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
460                                                   438 
461         /* See __mod_node_page_state */           439         /* See __mod_node_page_state */
462         preempt_disable_nested();              !! 440         if (IS_ENABLED(CONFIG_PREEMPT_RT))
                                                   >> 441                 preempt_disable();
463                                                   442 
464         v = __this_cpu_inc_return(*p);            443         v = __this_cpu_inc_return(*p);
465         t = __this_cpu_read(pcp->stat_threshol    444         t = __this_cpu_read(pcp->stat_threshold);
466         if (unlikely(v > t)) {                    445         if (unlikely(v > t)) {
467                 s8 overstep = t >> 1;             446                 s8 overstep = t >> 1;
468                                                   447 
469                 node_page_state_add(v + overst    448                 node_page_state_add(v + overstep, pgdat, item);
470                 __this_cpu_write(*p, -overstep    449                 __this_cpu_write(*p, -overstep);
471         }                                         450         }
472                                                   451 
473         preempt_enable_nested();               !! 452         if (IS_ENABLED(CONFIG_PREEMPT_RT))
                                                   >> 453                 preempt_enable();
474 }                                                 454 }
475                                                   455 
476 void __inc_zone_page_state(struct page *page,     456 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
477 {                                                 457 {
478         __inc_zone_state(page_zone(page), item    458         __inc_zone_state(page_zone(page), item);
479 }                                                 459 }
480 EXPORT_SYMBOL(__inc_zone_page_state);             460 EXPORT_SYMBOL(__inc_zone_page_state);
481                                                   461 
482 void __inc_node_page_state(struct page *page,     462 void __inc_node_page_state(struct page *page, enum node_stat_item item)
483 {                                                 463 {
484         __inc_node_state(page_pgdat(page), ite    464         __inc_node_state(page_pgdat(page), item);
485 }                                                 465 }
486 EXPORT_SYMBOL(__inc_node_page_state);             466 EXPORT_SYMBOL(__inc_node_page_state);
487                                                   467 
488 void __dec_zone_state(struct zone *zone, enum     468 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
489 {                                                 469 {
490         struct per_cpu_zonestat __percpu *pcp     470         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
491         s8 __percpu *p = pcp->vm_stat_diff + i    471         s8 __percpu *p = pcp->vm_stat_diff + item;
492         s8 v, t;                                  472         s8 v, t;
493                                                   473 
494         /* See __mod_node_page_state */           474         /* See __mod_node_page_state */
495         preempt_disable_nested();              !! 475         if (IS_ENABLED(CONFIG_PREEMPT_RT))
                                                   >> 476                 preempt_disable();
496                                                   477 
497         v = __this_cpu_dec_return(*p);            478         v = __this_cpu_dec_return(*p);
498         t = __this_cpu_read(pcp->stat_threshol    479         t = __this_cpu_read(pcp->stat_threshold);
499         if (unlikely(v < - t)) {                  480         if (unlikely(v < - t)) {
500                 s8 overstep = t >> 1;             481                 s8 overstep = t >> 1;
501                                                   482 
502                 zone_page_state_add(v - overst    483                 zone_page_state_add(v - overstep, zone, item);
503                 __this_cpu_write(*p, overstep)    484                 __this_cpu_write(*p, overstep);
504         }                                         485         }
505                                                   486 
506         preempt_enable_nested();               !! 487         if (IS_ENABLED(CONFIG_PREEMPT_RT))
                                                   >> 488                 preempt_enable();
507 }                                                 489 }
508                                                   490 
509 void __dec_node_state(struct pglist_data *pgda    491 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
510 {                                                 492 {
511         struct per_cpu_nodestat __percpu *pcp     493         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
512         s8 __percpu *p = pcp->vm_node_stat_dif    494         s8 __percpu *p = pcp->vm_node_stat_diff + item;
513         s8 v, t;                                  495         s8 v, t;
514                                                   496 
515         VM_WARN_ON_ONCE(vmstat_item_in_bytes(i    497         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
516                                                   498 
517         /* See __mod_node_page_state */           499         /* See __mod_node_page_state */
518         preempt_disable_nested();              !! 500         if (IS_ENABLED(CONFIG_PREEMPT_RT))
                                                   >> 501                 preempt_disable();
519                                                   502 
520         v = __this_cpu_dec_return(*p);            503         v = __this_cpu_dec_return(*p);
521         t = __this_cpu_read(pcp->stat_threshol    504         t = __this_cpu_read(pcp->stat_threshold);
522         if (unlikely(v < - t)) {                  505         if (unlikely(v < - t)) {
523                 s8 overstep = t >> 1;             506                 s8 overstep = t >> 1;
524                                                   507 
525                 node_page_state_add(v - overst    508                 node_page_state_add(v - overstep, pgdat, item);
526                 __this_cpu_write(*p, overstep)    509                 __this_cpu_write(*p, overstep);
527         }                                         510         }
528                                                   511 
529         preempt_enable_nested();               !! 512         if (IS_ENABLED(CONFIG_PREEMPT_RT))
                                                   >> 513                 preempt_enable();
530 }                                                 514 }
531                                                   515 
532 void __dec_zone_page_state(struct page *page,     516 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
533 {                                                 517 {
534         __dec_zone_state(page_zone(page), item    518         __dec_zone_state(page_zone(page), item);
535 }                                                 519 }
536 EXPORT_SYMBOL(__dec_zone_page_state);             520 EXPORT_SYMBOL(__dec_zone_page_state);
537                                                   521 
538 void __dec_node_page_state(struct page *page,     522 void __dec_node_page_state(struct page *page, enum node_stat_item item)
539 {                                                 523 {
540         __dec_node_state(page_pgdat(page), ite    524         __dec_node_state(page_pgdat(page), item);
541 }                                                 525 }
542 EXPORT_SYMBOL(__dec_node_page_state);             526 EXPORT_SYMBOL(__dec_node_page_state);
543                                                   527 
544 #ifdef CONFIG_HAVE_CMPXCHG_LOCAL                  528 #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
545 /*                                                529 /*
546  * If we have cmpxchg_local support then we do    530  * If we have cmpxchg_local support then we do not need to incur the overhead
547  * that comes with local_irq_save/restore if w    531  * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
548  *                                                532  *
549  * mod_state() modifies the zone counter state    533  * mod_state() modifies the zone counter state through atomic per cpu
550  * operations.                                    534  * operations.
551  *                                                535  *
552  * Overstep mode specifies how overstep should    536  * Overstep mode specifies how overstep should handled:
553  *     0       No overstepping                    537  *     0       No overstepping
554  *     1       Overstepping half of threshold     538  *     1       Overstepping half of threshold
555  *     -1      Overstepping minus half of thre    539  *     -1      Overstepping minus half of threshold
556 */                                                540 */
557 static inline void mod_zone_state(struct zone     541 static inline void mod_zone_state(struct zone *zone,
558        enum zone_stat_item item, long delta, i    542        enum zone_stat_item item, long delta, int overstep_mode)
559 {                                                 543 {
560         struct per_cpu_zonestat __percpu *pcp     544         struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
561         s8 __percpu *p = pcp->vm_stat_diff + i    545         s8 __percpu *p = pcp->vm_stat_diff + item;
562         long n, t, z;                          !! 546         long o, n, t, z;
563         s8 o;                                  << 
564                                                   547 
565         o = this_cpu_read(*p);                 << 
566         do {                                      548         do {
567                 z = 0;  /* overflow to zone co    549                 z = 0;  /* overflow to zone counters */
568                                                   550 
569                 /*                                551                 /*
570                  * The fetching of the stat_th    552                  * The fetching of the stat_threshold is racy. We may apply
571                  * a counter threshold to the     553                  * a counter threshold to the wrong the cpu if we get
572                  * rescheduled while executing    554                  * rescheduled while executing here. However, the next
573                  * counter update will apply t    555                  * counter update will apply the threshold again and
574                  * therefore bring the counter    556                  * therefore bring the counter under the threshold again.
575                  *                                557                  *
576                  * Most of the time the thresh    558                  * Most of the time the thresholds are the same anyways
577                  * for all cpus in a zone.        559                  * for all cpus in a zone.
578                  */                               560                  */
579                 t = this_cpu_read(pcp->stat_th    561                 t = this_cpu_read(pcp->stat_threshold);
580                                                   562 
581                 n = delta + (long)o;           !! 563                 o = this_cpu_read(*p);
                                                   >> 564                 n = delta + o;
582                                                   565 
583                 if (abs(n) > t) {                 566                 if (abs(n) > t) {
584                         int os = overstep_mode    567                         int os = overstep_mode * (t >> 1) ;
585                                                   568 
586                         /* Overflow must be ad    569                         /* Overflow must be added to zone counters */
587                         z = n + os;               570                         z = n + os;
588                         n = -os;                  571                         n = -os;
589                 }                                 572                 }
590         } while (!this_cpu_try_cmpxchg(*p, &o, !! 573         } while (this_cpu_cmpxchg(*p, o, n) != o);
591                                                   574 
592         if (z)                                    575         if (z)
593                 zone_page_state_add(z, zone, i    576                 zone_page_state_add(z, zone, item);
594 }                                                 577 }
595                                                   578 
596 void mod_zone_page_state(struct zone *zone, en    579 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
597                          long delta)              580                          long delta)
598 {                                                 581 {
599         mod_zone_state(zone, item, delta, 0);     582         mod_zone_state(zone, item, delta, 0);
600 }                                                 583 }
601 EXPORT_SYMBOL(mod_zone_page_state);               584 EXPORT_SYMBOL(mod_zone_page_state);
602                                                   585 
603 void inc_zone_page_state(struct page *page, en    586 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
604 {                                                 587 {
605         mod_zone_state(page_zone(page), item,     588         mod_zone_state(page_zone(page), item, 1, 1);
606 }                                                 589 }
607 EXPORT_SYMBOL(inc_zone_page_state);               590 EXPORT_SYMBOL(inc_zone_page_state);
608                                                   591 
609 void dec_zone_page_state(struct page *page, en    592 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
610 {                                                 593 {
611         mod_zone_state(page_zone(page), item,     594         mod_zone_state(page_zone(page), item, -1, -1);
612 }                                                 595 }
613 EXPORT_SYMBOL(dec_zone_page_state);               596 EXPORT_SYMBOL(dec_zone_page_state);
614                                                   597 
615 static inline void mod_node_state(struct pglis    598 static inline void mod_node_state(struct pglist_data *pgdat,
616        enum node_stat_item item, int delta, in    599        enum node_stat_item item, int delta, int overstep_mode)
617 {                                                 600 {
618         struct per_cpu_nodestat __percpu *pcp     601         struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
619         s8 __percpu *p = pcp->vm_node_stat_dif    602         s8 __percpu *p = pcp->vm_node_stat_diff + item;
620         long n, t, z;                          !! 603         long o, n, t, z;
621         s8 o;                                  << 
622                                                   604 
623         if (vmstat_item_in_bytes(item)) {         605         if (vmstat_item_in_bytes(item)) {
624                 /*                                606                 /*
625                  * Only cgroups use subpage ac    607                  * Only cgroups use subpage accounting right now; at
626                  * the global level, these ite    608                  * the global level, these items still change in
627                  * multiples of whole pages. S    609                  * multiples of whole pages. Store them as pages
628                  * internally to keep the per-    610                  * internally to keep the per-cpu counters compact.
629                  */                               611                  */
630                 VM_WARN_ON_ONCE(delta & (PAGE_    612                 VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
631                 delta >>= PAGE_SHIFT;             613                 delta >>= PAGE_SHIFT;
632         }                                         614         }
633                                                   615 
634         o = this_cpu_read(*p);                 << 
635         do {                                      616         do {
636                 z = 0;  /* overflow to node co    617                 z = 0;  /* overflow to node counters */
637                                                   618 
638                 /*                                619                 /*
639                  * The fetching of the stat_th    620                  * The fetching of the stat_threshold is racy. We may apply
640                  * a counter threshold to the     621                  * a counter threshold to the wrong cpu if we get
641                  * rescheduled while executing    622                  * rescheduled while executing here. However, the next
642                  * counter update will apply t    623                  * counter update will apply the threshold again and
643                  * therefore bring the counter    624                  * therefore bring the counter under the threshold again.
644                  *                                625                  *
645                  * Most of the time the thresh    626                  * Most of the time the thresholds are the same anyway
646                  * for all cpus in a node.        627                  * for all cpus in a node.
647                  */                               628                  */
648                 t = this_cpu_read(pcp->stat_th    629                 t = this_cpu_read(pcp->stat_threshold);
649                                                   630 
650                 n = delta + (long)o;           !! 631                 o = this_cpu_read(*p);
                                                   >> 632                 n = delta + o;
651                                                   633 
652                 if (abs(n) > t) {                 634                 if (abs(n) > t) {
653                         int os = overstep_mode    635                         int os = overstep_mode * (t >> 1) ;
654                                                   636 
655                         /* Overflow must be ad    637                         /* Overflow must be added to node counters */
656                         z = n + os;               638                         z = n + os;
657                         n = -os;                  639                         n = -os;
658                 }                                 640                 }
659         } while (!this_cpu_try_cmpxchg(*p, &o, !! 641         } while (this_cpu_cmpxchg(*p, o, n) != o);
660                                                   642 
661         if (z)                                    643         if (z)
662                 node_page_state_add(z, pgdat,     644                 node_page_state_add(z, pgdat, item);
663 }                                                 645 }
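
For the byte-counted items handled at the top of mod_node_state(), the per-cpu storage stays compact because node-level deltas for those items only ever arrive in whole-page multiples; the warning plus shift simply re-expresses the delta in pages. A small numeric illustration (PAGE_SHIFT of 12, i.e. 4 KiB pages, is an assumption here):

    #include <assert.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12                 /* assumed: 4 KiB pages */
    #define PAGE_SIZE  (1L << PAGE_SHIFT)

    int main(void)
    {
            long delta = 3 * PAGE_SIZE;   /* a byte-counted update of three pages */

            assert((delta & (PAGE_SIZE - 1)) == 0);   /* mirrors VM_WARN_ON_ONCE() */
            delta >>= PAGE_SHIFT;
            printf("stored as %ld pages\n", delta);   /* prints 3 */
            return 0;
    }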
664                                                   646 
665 void mod_node_page_state(struct pglist_data *p    647 void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
666                                         long d    648                                         long delta)
667 {                                                 649 {
668         mod_node_state(pgdat, item, delta, 0);    650         mod_node_state(pgdat, item, delta, 0);
669 }                                                 651 }
670 EXPORT_SYMBOL(mod_node_page_state);               652 EXPORT_SYMBOL(mod_node_page_state);
671                                                   653 
672 void inc_node_state(struct pglist_data *pgdat,    654 void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
673 {                                                 655 {
674         mod_node_state(pgdat, item, 1, 1);        656         mod_node_state(pgdat, item, 1, 1);
675 }                                                 657 }
676                                                   658 
677 void inc_node_page_state(struct page *page, en    659 void inc_node_page_state(struct page *page, enum node_stat_item item)
678 {                                                 660 {
679         mod_node_state(page_pgdat(page), item,    661         mod_node_state(page_pgdat(page), item, 1, 1);
680 }                                                 662 }
681 EXPORT_SYMBOL(inc_node_page_state);               663 EXPORT_SYMBOL(inc_node_page_state);
682                                                   664 
683 void dec_node_page_state(struct page *page, en    665 void dec_node_page_state(struct page *page, enum node_stat_item item)
684 {                                                 666 {
685         mod_node_state(page_pgdat(page), item,    667         mod_node_state(page_pgdat(page), item, -1, -1);
686 }                                                 668 }
687 EXPORT_SYMBOL(dec_node_page_state);               669 EXPORT_SYMBOL(dec_node_page_state);
688 #else                                             670 #else
689 /*                                                671 /*
690  * Use interrupt disable to serialize counter     672  * Use interrupt disable to serialize counter updates
691  */                                               673  */
692 void mod_zone_page_state(struct zone *zone, en    674 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
693                          long delta)              675                          long delta)
694 {                                                 676 {
695         unsigned long flags;                      677         unsigned long flags;
696                                                   678 
697         local_irq_save(flags);                    679         local_irq_save(flags);
698         __mod_zone_page_state(zone, item, delt    680         __mod_zone_page_state(zone, item, delta);
699         local_irq_restore(flags);                 681         local_irq_restore(flags);
700 }                                                 682 }
701 EXPORT_SYMBOL(mod_zone_page_state);               683 EXPORT_SYMBOL(mod_zone_page_state);
702                                                   684 
703 void inc_zone_page_state(struct page *page, en    685 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
704 {                                                 686 {
705         unsigned long flags;                      687         unsigned long flags;
706         struct zone *zone;                        688         struct zone *zone;
707                                                   689 
708         zone = page_zone(page);                   690         zone = page_zone(page);
709         local_irq_save(flags);                    691         local_irq_save(flags);
710         __inc_zone_state(zone, item);             692         __inc_zone_state(zone, item);
711         local_irq_restore(flags);                 693         local_irq_restore(flags);
712 }                                                 694 }
713 EXPORT_SYMBOL(inc_zone_page_state);               695 EXPORT_SYMBOL(inc_zone_page_state);
714                                                   696 
715 void dec_zone_page_state(struct page *page, en    697 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
716 {                                                 698 {
717         unsigned long flags;                      699         unsigned long flags;
718                                                   700 
719         local_irq_save(flags);                    701         local_irq_save(flags);
720         __dec_zone_page_state(page, item);        702         __dec_zone_page_state(page, item);
721         local_irq_restore(flags);                 703         local_irq_restore(flags);
722 }                                                 704 }
723 EXPORT_SYMBOL(dec_zone_page_state);               705 EXPORT_SYMBOL(dec_zone_page_state);
724                                                   706 
725 void inc_node_state(struct pglist_data *pgdat,    707 void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
726 {                                                 708 {
727         unsigned long flags;                      709         unsigned long flags;
728                                                   710 
729         local_irq_save(flags);                    711         local_irq_save(flags);
730         __inc_node_state(pgdat, item);            712         __inc_node_state(pgdat, item);
731         local_irq_restore(flags);                 713         local_irq_restore(flags);
732 }                                                 714 }
733 EXPORT_SYMBOL(inc_node_state);                    715 EXPORT_SYMBOL(inc_node_state);
734                                                   716 
735 void mod_node_page_state(struct pglist_data *p    717 void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
736                                         long d    718                                         long delta)
737 {                                                 719 {
738         unsigned long flags;                      720         unsigned long flags;
739                                                   721 
740         local_irq_save(flags);                    722         local_irq_save(flags);
741         __mod_node_page_state(pgdat, item, del    723         __mod_node_page_state(pgdat, item, delta);
742         local_irq_restore(flags);                 724         local_irq_restore(flags);
743 }                                                 725 }
744 EXPORT_SYMBOL(mod_node_page_state);               726 EXPORT_SYMBOL(mod_node_page_state);
745                                                   727 
746 void inc_node_page_state(struct page *page, en    728 void inc_node_page_state(struct page *page, enum node_stat_item item)
747 {                                                 729 {
748         unsigned long flags;                      730         unsigned long flags;
749         struct pglist_data *pgdat;                731         struct pglist_data *pgdat;
750                                                   732 
751         pgdat = page_pgdat(page);                 733         pgdat = page_pgdat(page);
752         local_irq_save(flags);                    734         local_irq_save(flags);
753         __inc_node_state(pgdat, item);            735         __inc_node_state(pgdat, item);
754         local_irq_restore(flags);                 736         local_irq_restore(flags);
755 }                                                 737 }
756 EXPORT_SYMBOL(inc_node_page_state);               738 EXPORT_SYMBOL(inc_node_page_state);
757                                                   739 
758 void dec_node_page_state(struct page *page, en    740 void dec_node_page_state(struct page *page, enum node_stat_item item)
759 {                                                 741 {
760         unsigned long flags;                      742         unsigned long flags;
761                                                   743 
762         local_irq_save(flags);                    744         local_irq_save(flags);
763         __dec_node_page_state(page, item);        745         __dec_node_page_state(page, item);
764         local_irq_restore(flags);                 746         local_irq_restore(flags);
765 }                                                 747 }
766 EXPORT_SYMBOL(dec_node_page_state);               748 EXPORT_SYMBOL(dec_node_page_state);
767 #endif                                            749 #endif
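
In the branch above (built when the SMP counter machinery is not available), the only thing that can race with a counter update is an interrupt on the same CPU, so the wrappers simply bracket the plain __mod/__inc/__dec helpers with local_irq_save()/local_irq_restore(). A loose user-space analogue of that idea, blocking a signal instead of disabling interrupts (SIGALRM and the counter are stand-ins chosen for the example):

    #include <signal.h>
    #include <stdio.h>

    static long counter;                      /* plain, unserialized counter */

    static void mod_counter(long delta)
    {
            sigset_t block, old;

            sigemptyset(&block);
            sigaddset(&block, SIGALRM);                 /* ~ local_irq_save()    */
            sigprocmask(SIG_BLOCK, &block, &old);

            counter += delta;                           /* the protected update  */

            sigprocmask(SIG_SETMASK, &old, NULL);       /* ~ local_irq_restore() */
    }

    int main(void)
    {
            mod_counter(5);
            mod_counter(-2);
            printf("counter = %ld\n", counter);         /* prints 3 */
            return 0;
    }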
768                                                   750 
769 /*                                                751 /*
770  * Fold a differential into the global counter    752  * Fold a differential into the global counters.
771  * Returns the number of counters updated.        753  * Returns the number of counters updated.
772  */                                               754  */
773 static int fold_diff(int *zone_diff, int *node    755 static int fold_diff(int *zone_diff, int *node_diff)
774 {                                                 756 {
775         int i;                                    757         int i;
776         int changes = 0;                          758         int changes = 0;
777                                                   759 
778         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS;    760         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
779                 if (zone_diff[i]) {               761                 if (zone_diff[i]) {
780                         atomic_long_add(zone_d    762                         atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
781                         changes++;                763                         changes++;
782         }                                         764         }
783                                                   765 
784         for (i = 0; i < NR_VM_NODE_STAT_ITEMS;    766         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
785                 if (node_diff[i]) {               767                 if (node_diff[i]) {
786                         atomic_long_add(node_d    768                         atomic_long_add(node_diff[i], &vm_node_stat[i]);
787                         changes++;                769                         changes++;
788         }                                         770         }
789         return changes;                           771         return changes;
790 }                                                 772 }
791                                                   773 
                                                   >> 774 #ifdef CONFIG_NUMA
                                                   >> 775 static void fold_vm_zone_numa_events(struct zone *zone)
                                                   >> 776 {
                                                   >> 777         unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
                                                   >> 778         int cpu;
                                                   >> 779         enum numa_stat_item item;
                                                   >> 780 
                                                   >> 781         for_each_online_cpu(cpu) {
                                                   >> 782                 struct per_cpu_zonestat *pzstats;
                                                   >> 783 
                                                   >> 784                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
                                                   >> 785                 for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
                                                   >> 786                         zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
                                                   >> 787         }
                                                   >> 788 
                                                   >> 789         for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
                                                   >> 790                 zone_numa_event_add(zone_numa_events[item], zone, item);
                                                   >> 791 }
                                                   >> 792 
                                                   >> 793 void fold_vm_numa_events(void)
                                                   >> 794 {
                                                   >> 795         struct zone *zone;
                                                   >> 796 
                                                   >> 797         for_each_populated_zone(zone)
                                                   >> 798                 fold_vm_zone_numa_events(zone);
                                                   >> 799 }
                                                   >> 800 #endif
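
fold_vm_zone_numa_events() in the 5.15 column drains each CPU's NUMA event counters with xchg(..., 0), so increments that race with the fold are simply picked up by the next fold instead of being lost. The same swap-to-zero accumulation in a stand-alone sketch (C11 atomics as an analogue of the kernel's xchg(); the four-CPU array and the sample increments are made up):

    #include <stdatomic.h>
    #include <stdio.h>

    #define NCPUS 4

    static atomic_ulong percpu_event[NCPUS];  /* stands in for pzstats->vm_numa_event[] */
    static unsigned long zone_total;          /* stands in for the zone-wide counter    */

    static void fold_events(void)
    {
            for (int cpu = 0; cpu < NCPUS; cpu++)
                    zone_total += atomic_exchange(&percpu_event[cpu], 0);
    }

    int main(void)
    {
            atomic_fetch_add(&percpu_event[0], 3);
            atomic_fetch_add(&percpu_event[2], 5);
            fold_events();
            printf("zone_total = %lu\n", zone_total);   /* prints 8 */
            return 0;
    }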
                                                   >> 801 
792 /*                                                802 /*
793  * Update the zone counters for the current cp    803  * Update the zone counters for the current cpu.
794  *                                                804  *
795  * Note that refresh_cpu_vm_stats strives to o    805  * Note that refresh_cpu_vm_stats strives to only access
796  * node local memory. The per cpu pagesets on     806  * node local memory. The per cpu pagesets on remote zones are placed
797  * in the memory local to the processor using     807  * in the memory local to the processor using that pageset. So the
798  * loop over all zones will access a series of    808  * loop over all zones will access a series of cachelines local to
799  * the processor.                                 809  * the processor.
800  *                                                810  *
801  * The call to zone_page_state_add updates the    811  * The call to zone_page_state_add updates the cachelines with the
802  * statistics in the remote zone struct as wel    812  * statistics in the remote zone struct as well as the global cachelines
803  * with the global counters. These could cause    813  * with the global counters. These could cause remote node cache line
804  * bouncing and will have to be only done when    814  * bouncing and will have to be only done when necessary.
805  *                                                815  *
806  * The function returns the number of global c    816  * The function returns the number of global counters updated.
807  */                                               817  */
808 static int refresh_cpu_vm_stats(bool do_pagese    818 static int refresh_cpu_vm_stats(bool do_pagesets)
809 {                                                 819 {
810         struct pglist_data *pgdat;                820         struct pglist_data *pgdat;
811         struct zone *zone;                        821         struct zone *zone;
812         int i;                                    822         int i;
813         int global_zone_diff[NR_VM_ZONE_STAT_I    823         int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
814         int global_node_diff[NR_VM_NODE_STAT_I    824         int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
815         int changes = 0;                          825         int changes = 0;
816                                                   826 
817         for_each_populated_zone(zone) {           827         for_each_populated_zone(zone) {
818                 struct per_cpu_zonestat __perc    828                 struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
                                                   >> 829 #ifdef CONFIG_NUMA
819                 struct per_cpu_pages __percpu     830                 struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;
                                                   >> 831 #endif
820                                                   832 
821                 for (i = 0; i < NR_VM_ZONE_STA    833                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
822                         int v;                    834                         int v;
823                                                   835 
824                         v = this_cpu_xchg(pzst    836                         v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
825                         if (v) {                  837                         if (v) {
826                                                   838 
827                                 atomic_long_ad    839                                 atomic_long_add(v, &zone->vm_stat[i]);
828                                 global_zone_di    840                                 global_zone_diff[i] += v;
829 #ifdef CONFIG_NUMA                                841 #ifdef CONFIG_NUMA
830                                 /* 3 seconds i    842                                 /* 3 seconds idle till flush */
831                                 __this_cpu_wri    843                                 __this_cpu_write(pcp->expire, 3);
832 #endif                                            844 #endif
833                         }                         845                         }
834                 }                                 846                 }
                                                   >> 847 #ifdef CONFIG_NUMA
835                                                   848 
836                 if (do_pagesets) {                849                 if (do_pagesets) {
837                         cond_resched();           850                         cond_resched();
838                                                << 
839                         changes += decay_pcp_h << 
840 #ifdef CONFIG_NUMA                             << 
841                         /*                        851                         /*
842                          * Deal with draining     852                          * Deal with draining the remote pageset of this
843                          * processor              853                          * processor
844                          *                        854                          *
845                          * Check if there are     855                          * Check if there are pages remaining in this pageset
846                          * if not then there i    856                          * if not then there is nothing to expire.
847                          */                       857                          */
848                         if (!__this_cpu_read(p    858                         if (!__this_cpu_read(pcp->expire) ||
849                                !__this_cpu_rea    859                                !__this_cpu_read(pcp->count))
850                                 continue;         860                                 continue;
851                                                   861 
852                         /*                        862                         /*
853                          * We never drain zone    863                          * We never drain zones local to this processor.
854                          */                       864                          */
855                         if (zone_to_nid(zone)     865                         if (zone_to_nid(zone) == numa_node_id()) {
856                                 __this_cpu_wri    866                                 __this_cpu_write(pcp->expire, 0);
857                                 continue;         867                                 continue;
858                         }                         868                         }
859                                                   869 
860                         if (__this_cpu_dec_ret !! 870                         if (__this_cpu_dec_return(pcp->expire))
861                                 changes++;     << 
862                                 continue;         871                                 continue;
863                         }                      << 
864                                                   872 
865                         if (__this_cpu_read(pc    873                         if (__this_cpu_read(pcp->count)) {
866                                 drain_zone_pag    874                                 drain_zone_pages(zone, this_cpu_ptr(pcp));
867                                 changes++;        875                                 changes++;
868                         }                         876                         }
869 #endif                                         << 
870                 }                                 877                 }
                                                   >> 878 #endif
871         }                                         879         }
872                                                   880 
873         for_each_online_pgdat(pgdat) {            881         for_each_online_pgdat(pgdat) {
874                 struct per_cpu_nodestat __perc    882                 struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
875                                                   883 
876                 for (i = 0; i < NR_VM_NODE_STA    884                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
877                         int v;                    885                         int v;
878                                                   886 
879                         v = this_cpu_xchg(p->v    887                         v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
880                         if (v) {                  888                         if (v) {
881                                 atomic_long_ad    889                                 atomic_long_add(v, &pgdat->vm_stat[i]);
882                                 global_node_di    890                                 global_node_diff[i] += v;
883                         }                         891                         }
884                 }                                 892                 }
885         }                                         893         }
886                                                   894 
887         changes += fold_diff(global_zone_diff,    895         changes += fold_diff(global_zone_diff, global_node_diff);
888         return changes;                           896         return changes;
889 }                                                 897 }
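
The expire handling inside refresh_cpu_vm_stats() works as a countdown: whenever this CPU has recently updated a zone's counters, pcp->expire is re-armed to 3 ("3 seconds idle till flush"); each later pass decrements it, and only when it reaches zero with pages still sitting in the remote pageset are they drained (zones local to this CPU are never drained). A toy countdown showing the effect (the per-pass cadence and the page count are illustrative assumptions):

    #include <stdio.h>

    int main(void)
    {
            int expire = 3;                   /* just re-armed: this zone was touched */
            int count  = 12;                  /* pages still cached for that zone     */

            for (int pass = 1; pass <= 5; pass++) {
                    if (!expire || !count)
                            continue;         /* nothing cached or nothing armed      */
                    if (--expire) {
                            printf("pass %d: still warm, expire=%d\n", pass, expire);
                            continue;
                    }
                    printf("pass %d: idle long enough, draining %d pages\n", pass, count);
                    count = 0;
            }
            return 0;
    }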
890                                                   898 
891 /*                                                899 /*
892  * Fold the data for an offline cpu into the g    900  * Fold the data for an offline cpu into the global array.
893  * There cannot be any access by the offline c    901  * There cannot be any access by the offline cpu and therefore
894  * synchronization is simplified.                 902  * synchronization is simplified.
895  */                                               903  */
896 void cpu_vm_stats_fold(int cpu)                   904 void cpu_vm_stats_fold(int cpu)
897 {                                                 905 {
898         struct pglist_data *pgdat;                906         struct pglist_data *pgdat;
899         struct zone *zone;                        907         struct zone *zone;
900         int i;                                    908         int i;
901         int global_zone_diff[NR_VM_ZONE_STAT_I    909         int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
902         int global_node_diff[NR_VM_NODE_STAT_I    910         int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
903                                                   911 
904         for_each_populated_zone(zone) {           912         for_each_populated_zone(zone) {
905                 struct per_cpu_zonestat *pzsta    913                 struct per_cpu_zonestat *pzstats;
906                                                   914 
907                 pzstats = per_cpu_ptr(zone->pe    915                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
908                                                   916 
909                 for (i = 0; i < NR_VM_ZONE_STA    917                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
910                         if (pzstats->vm_stat_d    918                         if (pzstats->vm_stat_diff[i]) {
911                                 int v;            919                                 int v;
912                                                   920 
913                                 v = pzstats->v    921                                 v = pzstats->vm_stat_diff[i];
914                                 pzstats->vm_st    922                                 pzstats->vm_stat_diff[i] = 0;
915                                 atomic_long_ad    923                                 atomic_long_add(v, &zone->vm_stat[i]);
916                                 global_zone_di    924                                 global_zone_diff[i] += v;
917                         }                         925                         }
918                 }                                 926                 }
919 #ifdef CONFIG_NUMA                                927 #ifdef CONFIG_NUMA
920                 for (i = 0; i < NR_VM_NUMA_EVE    928                 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
921                         if (pzstats->vm_numa_e    929                         if (pzstats->vm_numa_event[i]) {
922                                 unsigned long     930                                 unsigned long v;
923                                                   931 
924                                 v = pzstats->v    932                                 v = pzstats->vm_numa_event[i];
925                                 pzstats->vm_nu    933                                 pzstats->vm_numa_event[i] = 0;
926                                 zone_numa_even    934                                 zone_numa_event_add(v, zone, i);
927                         }                         935                         }
928                 }                                 936                 }
929 #endif                                            937 #endif
930         }                                         938         }
931                                                   939 
932         for_each_online_pgdat(pgdat) {            940         for_each_online_pgdat(pgdat) {
933                 struct per_cpu_nodestat *p;       941                 struct per_cpu_nodestat *p;
934                                                   942 
935                 p = per_cpu_ptr(pgdat->per_cpu    943                 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
936                                                   944 
937                 for (i = 0; i < NR_VM_NODE_STA    945                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
938                         if (p->vm_node_stat_di    946                         if (p->vm_node_stat_diff[i]) {
939                                 int v;            947                                 int v;
940                                                   948 
941                                 v = p->vm_node    949                                 v = p->vm_node_stat_diff[i];
942                                 p->vm_node_sta    950                                 p->vm_node_stat_diff[i] = 0;
943                                 atomic_long_ad    951                                 atomic_long_add(v, &pgdat->vm_stat[i]);
944                                 global_node_di    952                                 global_node_diff[i] += v;
945                         }                         953                         }
946         }                                         954         }
947                                                   955 
948         fold_diff(global_zone_diff, global_nod    956         fold_diff(global_zone_diff, global_node_diff);
949 }                                                 957 }
950                                                   958 
951 /*                                                959 /*
952  * this is only called if !populated_zone(zone    960  * this is only called if !populated_zone(zone), which implies no other users of
953  * pset->vm_stat_diff[] exist.                    961  * pset->vm_stat_diff[] exist.
954  */                                               962  */
955 void drain_zonestat(struct zone *zone, struct     963 void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
956 {                                                 964 {
957         unsigned long v;                          965         unsigned long v;
958         int i;                                    966         int i;
959                                                   967 
960         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS;    968         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
961                 if (pzstats->vm_stat_diff[i])     969                 if (pzstats->vm_stat_diff[i]) {
962                         v = pzstats->vm_stat_d    970                         v = pzstats->vm_stat_diff[i];
963                         pzstats->vm_stat_diff[    971                         pzstats->vm_stat_diff[i] = 0;
964                         zone_page_state_add(v,    972                         zone_page_state_add(v, zone, i);
965                 }                                 973                 }
966         }                                         974         }
967                                                   975 
968 #ifdef CONFIG_NUMA                                976 #ifdef CONFIG_NUMA
969         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS    977         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
970                 if (pzstats->vm_numa_event[i])    978                 if (pzstats->vm_numa_event[i]) {
971                         v = pzstats->vm_numa_e    979                         v = pzstats->vm_numa_event[i];
972                         pzstats->vm_numa_event    980                         pzstats->vm_numa_event[i] = 0;
973                         zone_numa_event_add(v,    981                         zone_numa_event_add(v, zone, i);
974                 }                                 982                 }
975         }                                         983         }
976 #endif                                            984 #endif
977 }                                                 985 }
978 #endif                                            986 #endif
979                                                   987 
980 #ifdef CONFIG_NUMA                                988 #ifdef CONFIG_NUMA
981 /*                                                989 /*
982  * Determine the per node value of a stat item    990  * Determine the per node value of a stat item. This function
983  * is called frequently in a NUMA machine, so     991  * is called frequently in a NUMA machine, so try to be as
984  * frugal as possible.                            992  * frugal as possible.
985  */                                               993  */
986 unsigned long sum_zone_node_page_state(int nod    994 unsigned long sum_zone_node_page_state(int node,
987                                  enum zone_sta    995                                  enum zone_stat_item item)
988 {                                                 996 {
989         struct zone *zones = NODE_DATA(node)->    997         struct zone *zones = NODE_DATA(node)->node_zones;
990         int i;                                    998         int i;
991         unsigned long count = 0;                  999         unsigned long count = 0;
992                                                   1000 
993         for (i = 0; i < MAX_NR_ZONES; i++)        1001         for (i = 0; i < MAX_NR_ZONES; i++)
994                 count += zone_page_state(zones    1002                 count += zone_page_state(zones + i, item);
995                                                   1003 
996         return count;                             1004         return count;
997 }                                                 1005 }
998                                                   1006 
999 /* Determine the per node value of a numa stat    1007 /* Determine the per node value of a numa stat item. */
1000 unsigned long sum_zone_numa_event_state(int n    1008 unsigned long sum_zone_numa_event_state(int node,
1001                                  enum numa_st    1009                                  enum numa_stat_item item)
1002 {                                                1010 {
1003         struct zone *zones = NODE_DATA(node)-    1011         struct zone *zones = NODE_DATA(node)->node_zones;
1004         unsigned long count = 0;                 1012         unsigned long count = 0;
1005         int i;                                   1013         int i;
1006                                                  1014 
1007         for (i = 0; i < MAX_NR_ZONES; i++)       1015         for (i = 0; i < MAX_NR_ZONES; i++)
1008                 count += zone_numa_event_stat    1016                 count += zone_numa_event_state(zones + i, item);
1009                                                  1017 
1010         return count;                            1018         return count;
1011 }                                                1019 }
1012                                                  1020 
1013 /*                                               1021 /*
1014  * Determine the per node value of a stat ite    1022  * Determine the per node value of a stat item.
1015  */                                              1023  */
1016 unsigned long node_page_state_pages(struct pg    1024 unsigned long node_page_state_pages(struct pglist_data *pgdat,
1017                                     enum node    1025                                     enum node_stat_item item)
1018 {                                                1026 {
1019         long x = atomic_long_read(&pgdat->vm_    1027         long x = atomic_long_read(&pgdat->vm_stat[item]);
1020 #ifdef CONFIG_SMP                                1028 #ifdef CONFIG_SMP
1021         if (x < 0)                               1029         if (x < 0)
1022                 x = 0;                           1030                 x = 0;
1023 #endif                                           1031 #endif
1024         return x;                                1032         return x;
1025 }                                                1033 }
1026                                                  1034 
1027 unsigned long node_page_state(struct pglist_d    1035 unsigned long node_page_state(struct pglist_data *pgdat,
1028                               enum node_stat_    1036                               enum node_stat_item item)
1029 {                                                1037 {
1030         VM_WARN_ON_ONCE(vmstat_item_in_bytes(    1038         VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
1031                                                  1039 
1032         return node_page_state_pages(pgdat, i    1040         return node_page_state_pages(pgdat, item);
1033 }                                                1041 }
1034 #endif                                           1042 #endif
1035                                                  1043 
1036 /*                                            << 
1037  * Count number of pages "struct page" and "s << 
1038  * nr_memmap_boot_pages: # of pages allocated << 
1039  * nr_memmap_pages: # of pages that were allo << 
1040  */                                           << 
1041 static atomic_long_t nr_memmap_boot_pages = A << 
1042 static atomic_long_t nr_memmap_pages = ATOMIC << 
1043                                               << 
1044 void memmap_boot_pages_add(long delta)        << 
1045 {                                             << 
1046         atomic_long_add(delta, &nr_memmap_boo << 
1047 }                                             << 
1048                                               << 
1049 void memmap_pages_add(long delta)             << 
1050 {                                             << 
1051         atomic_long_add(delta, &nr_memmap_pag << 
1052 }                                             << 
1053                                               << 
1054 #ifdef CONFIG_COMPACTION                         1044 #ifdef CONFIG_COMPACTION
1055                                                  1045 
1056 struct contig_page_info {                        1046 struct contig_page_info {
1057         unsigned long free_pages;                1047         unsigned long free_pages;
1058         unsigned long free_blocks_total;         1048         unsigned long free_blocks_total;
1059         unsigned long free_blocks_suitable;      1049         unsigned long free_blocks_suitable;
1060 };                                               1050 };
1061                                                  1051 
1062 /*                                               1052 /*
1063  * Calculate the number of free pages in a zo    1053  * Calculate the number of free pages in a zone, how many contiguous
1064  * pages are free and how many are large enou    1054  * pages are free and how many are large enough to satisfy an allocation of
1065  * the target size. Note that this function m    1055  * the target size. Note that this function makes no attempt to estimate
1066  * how many suitable free blocks there *might    1056  * how many suitable free blocks there *might* be if MOVABLE pages were
1067  * migrated. Calculating that is possible, bu    1057  * migrated. Calculating that is possible, but expensive and can be
1068  * figured out from userspace                    1058  * figured out from userspace
1069  */                                              1059  */
1070 static void fill_contig_page_info(struct zone    1060 static void fill_contig_page_info(struct zone *zone,
1071                                 unsigned int     1061                                 unsigned int suitable_order,
1072                                 struct contig    1062                                 struct contig_page_info *info)
1073 {                                                1063 {
1074         unsigned int order;                      1064         unsigned int order;
1075                                                  1065 
1076         info->free_pages = 0;                    1066         info->free_pages = 0;
1077         info->free_blocks_total = 0;             1067         info->free_blocks_total = 0;
1078         info->free_blocks_suitable = 0;          1068         info->free_blocks_suitable = 0;
1079                                                  1069 
1080         for (order = 0; order < NR_PAGE_ORDER !! 1070         for (order = 0; order < MAX_ORDER; order++) {
1081                 unsigned long blocks;            1071                 unsigned long blocks;
1082                                                  1072 
1083                 /*                            !! 1073                 /* Count number of free blocks */
1084                  * Count number of free block !! 1074                 blocks = zone->free_area[order].nr_free;
1085                  *                            << 
1086                  * Access to nr_free is lockl << 
1087                  * diagnostic purposes. Use d << 
1088                  */                           << 
1089                 blocks = data_race(zone->free << 
1090                 info->free_blocks_total += bl    1075                 info->free_blocks_total += blocks;
1091                                                  1076 
1092                 /* Count free base pages */      1077                 /* Count free base pages */
1093                 info->free_pages += blocks <<    1078                 info->free_pages += blocks << order;
1094                                                  1079 
1095                 /* Count the suitable free bl    1080                 /* Count the suitable free blocks */
1096                 if (order >= suitable_order)     1081                 if (order >= suitable_order)
1097                         info->free_blocks_sui    1082                         info->free_blocks_suitable += blocks <<
1098                                                  1083                                                 (order - suitable_order);
1099         }                                        1084         }
1100 }                                                1085 }
1101                                                  1086 
1102 /*                                               1087 /*
1103  * A fragmentation index only makes sense if     1088  * A fragmentation index only makes sense if an allocation of a requested
1104  * size would fail. If that is true, the frag    1089  * size would fail. If that is true, the fragmentation index indicates
1105  * whether external fragmentation or a lack o    1090  * whether external fragmentation or a lack of memory was the problem.
1106  * The value can be used to determine if page    1091  * The value can be used to determine if page reclaim or compaction
1107  * should be used                                1092  * should be used
1108  */                                              1093  */
1109 static int __fragmentation_index(unsigned int    1094 static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
1110 {                                                1095 {
1111         unsigned long requested = 1UL << orde    1096         unsigned long requested = 1UL << order;
1112                                                  1097 
1113         if (WARN_ON_ONCE(order > MAX_PAGE_ORD !! 1098         if (WARN_ON_ONCE(order >= MAX_ORDER))
1114                 return 0;                        1099                 return 0;
1115                                                  1100 
1116         if (!info->free_blocks_total)            1101         if (!info->free_blocks_total)
1117                 return 0;                        1102                 return 0;
1118                                                  1103 
1119         /* Fragmentation index only makes sen    1104         /* Fragmentation index only makes sense when a request would fail */
1120         if (info->free_blocks_suitable)          1105         if (info->free_blocks_suitable)
1121                 return -1000;                    1106                 return -1000;
1122                                                  1107 
1123         /*                                       1108         /*
1124          * Index is between 0 and 1 so return    1109          * Index is between 0 and 1 so return within 3 decimal places
1125          *                                       1110          *
1126          * 0 => allocation would fail due to     1111          * 0 => allocation would fail due to lack of memory
1127          * 1 => allocation would fail due to     1112          * 1 => allocation would fail due to fragmentation
1128          */                                      1113          */
1129         return 1000 - div_u64( (1000+(div_u64    1114         return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
1130 }                                                1115 }
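
To make the scaled integer arithmetic above concrete, here is a worked example with invented numbers: an order-4 request (requested = 16 pages) fails in a zone holding 1000 free pages spread across 250 free blocks, none of which is order-4 or larger.

    #include <stdio.h>

    int main(void)
    {
            unsigned long requested            = 1UL << 4;   /* order-4 request     */
            unsigned long free_pages           = 1000;       /* invented zone state */
            unsigned long free_blocks_total    = 250;
            unsigned long free_blocks_suitable = 0;          /* request would fail  */

            long index;

            if (free_blocks_suitable)
                    index = -1000;            /* the request would in fact succeed   */
            else
                    index = 1000 - (1000 + free_pages * 1000 / requested)
                                    / free_blocks_total;

            printf("fragmentation index = %ld\n", index);    /* prints 746 */
            return 0;
    }

An index of 746 (roughly 0.75 on the 0..1 scale) says the failure is mostly down to external fragmentation rather than a lack of free memory, so compaction is a more promising remedy than reclaim.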
1131                                                  1116 
1132 /*                                               1117 /*
1133  * Calculates external fragmentation within a    1118  * Calculates external fragmentation within a zone wrt the given order.
1134  * It is defined as the percentage of pages f    1119  * It is defined as the percentage of pages found in blocks of size
1135  * less than 1 << order. It returns values in    1120  * less than 1 << order. It returns values in range [0, 100].
1136  */                                              1121  */
1137 unsigned int extfrag_for_order(struct zone *z    1122 unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
1138 {                                                1123 {
1139         struct contig_page_info info;            1124         struct contig_page_info info;
1140                                                  1125 
1141         fill_contig_page_info(zone, order, &i    1126         fill_contig_page_info(zone, order, &info);
1142         if (info.free_pages == 0)                1127         if (info.free_pages == 0)
1143                 return 0;                        1128                 return 0;
1144                                                  1129 
1145         return div_u64((info.free_pages -        1130         return div_u64((info.free_pages -
1146                         (info.free_blocks_sui    1131                         (info.free_blocks_suitable << order)) * 100,
1147                         info.free_pages);        1132                         info.free_pages);
1148 }                                                1133 }
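
As a quick sanity check of the percentage above (numbers invented for illustration): with 1000 free base pages, of which 90 free blocks are of the requested order 3 or larger, the pages that do not sit in suitably large blocks amount to 1000 - (90 << 3) = 280, giving an external fragmentation of 280 * 100 / 1000 = 28% with respect to order 3.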
1149                                                  1134 
1150 /* Same as __fragmentation index but allocs c    1135 /* Same as __fragmentation index but allocs contig_page_info on stack */
1151 int fragmentation_index(struct zone *zone, un    1136 int fragmentation_index(struct zone *zone, unsigned int order)
1152 {                                                1137 {
1153         struct contig_page_info info;            1138         struct contig_page_info info;
1154                                                  1139 
1155         fill_contig_page_info(zone, order, &i    1140         fill_contig_page_info(zone, order, &info);
1156         return __fragmentation_index(order, &    1141         return __fragmentation_index(order, &info);
1157 }                                                1142 }
1158 #endif                                           1143 #endif
1159                                                  1144 
1160 #if defined(CONFIG_PROC_FS) || defined(CONFIG    1145 #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
1161     defined(CONFIG_NUMA) || defined(CONFIG_ME    1146     defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
1162 #ifdef CONFIG_ZONE_DMA                           1147 #ifdef CONFIG_ZONE_DMA
1163 #define TEXT_FOR_DMA(xx) xx "_dma",              1148 #define TEXT_FOR_DMA(xx) xx "_dma",
1164 #else                                            1149 #else
1165 #define TEXT_FOR_DMA(xx)                         1150 #define TEXT_FOR_DMA(xx)
1166 #endif                                           1151 #endif
1167                                                  1152 
1168 #ifdef CONFIG_ZONE_DMA32                         1153 #ifdef CONFIG_ZONE_DMA32
1169 #define TEXT_FOR_DMA32(xx) xx "_dma32",          1154 #define TEXT_FOR_DMA32(xx) xx "_dma32",
1170 #else                                            1155 #else
1171 #define TEXT_FOR_DMA32(xx)                       1156 #define TEXT_FOR_DMA32(xx)
1172 #endif                                           1157 #endif
1173                                                  1158 
1174 #ifdef CONFIG_HIGHMEM                            1159 #ifdef CONFIG_HIGHMEM
1175 #define TEXT_FOR_HIGHMEM(xx) xx "_high",         1160 #define TEXT_FOR_HIGHMEM(xx) xx "_high",
1176 #else                                            1161 #else
1177 #define TEXT_FOR_HIGHMEM(xx)                     1162 #define TEXT_FOR_HIGHMEM(xx)
1178 #endif                                           1163 #endif
1179                                                  1164 
1180 #ifdef CONFIG_ZONE_DEVICE                     << 
1181 #define TEXT_FOR_DEVICE(xx) xx "_device",     << 
1182 #else                                         << 
1183 #define TEXT_FOR_DEVICE(xx)                   << 
1184 #endif                                        << 
1185                                               << 
1186 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx)     1165 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
1187                                         TEXT_ !! 1166                                         TEXT_FOR_HIGHMEM(xx) xx "_movable",
1188                                         TEXT_ << 
1189                                                  1167 
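For reference, TEXTS_FOR_ZONES() stamps out one name per configured zone; assuming CONFIG_ZONE_DMA, CONFIG_ZONE_DMA32 and CONFIG_ZONE_DEVICE are enabled and HIGHMEM is not (a typical x86_64 configuration), TEXTS_FOR_ZONES("pgalloc") on the 6.12 side would expand to "pgalloc_dma", "pgalloc_dma32", "pgalloc_normal", "pgalloc_movable", "pgalloc_device", while the 5.15 column stops at "pgalloc_movable" because it has no TEXT_FOR_DEVICE() variant.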
1190 const char * const vmstat_text[] = {             1168 const char * const vmstat_text[] = {
1191         /* enum zone_stat_item counters */       1169         /* enum zone_stat_item counters */
1192         "nr_free_pages",                         1170         "nr_free_pages",
1193         "nr_zone_inactive_anon",                 1171         "nr_zone_inactive_anon",
1194         "nr_zone_active_anon",                   1172         "nr_zone_active_anon",
1195         "nr_zone_inactive_file",                 1173         "nr_zone_inactive_file",
1196         "nr_zone_active_file",                   1174         "nr_zone_active_file",
1197         "nr_zone_unevictable",                   1175         "nr_zone_unevictable",
1198         "nr_zone_write_pending",                 1176         "nr_zone_write_pending",
1199         "nr_mlock",                              1177         "nr_mlock",
1200         "nr_bounce",                             1178         "nr_bounce",
1201 #if IS_ENABLED(CONFIG_ZSMALLOC)                  1179 #if IS_ENABLED(CONFIG_ZSMALLOC)
1202         "nr_zspages",                            1180         "nr_zspages",
1203 #endif                                           1181 #endif
1204         "nr_free_cma",                           1182         "nr_free_cma",
1205 #ifdef CONFIG_UNACCEPTED_MEMORY               << 
1206         "nr_unaccepted",                      << 
1207 #endif                                        << 
1208                                                  1183 
1209         /* enum numa_stat_item counters */       1184         /* enum numa_stat_item counters */
1210 #ifdef CONFIG_NUMA                               1185 #ifdef CONFIG_NUMA
1211         "numa_hit",                              1186         "numa_hit",
1212         "numa_miss",                             1187         "numa_miss",
1213         "numa_foreign",                          1188         "numa_foreign",
1214         "numa_interleave",                       1189         "numa_interleave",
1215         "numa_local",                            1190         "numa_local",
1216         "numa_other",                            1191         "numa_other",
1217 #endif                                           1192 #endif
1218                                                  1193 
1219         /* enum node_stat_item counters */       1194         /* enum node_stat_item counters */
1220         "nr_inactive_anon",                      1195         "nr_inactive_anon",
1221         "nr_active_anon",                        1196         "nr_active_anon",
1222         "nr_inactive_file",                      1197         "nr_inactive_file",
1223         "nr_active_file",                        1198         "nr_active_file",
1224         "nr_unevictable",                        1199         "nr_unevictable",
1225         "nr_slab_reclaimable",                   1200         "nr_slab_reclaimable",
1226         "nr_slab_unreclaimable",                 1201         "nr_slab_unreclaimable",
1227         "nr_isolated_anon",                      1202         "nr_isolated_anon",
1228         "nr_isolated_file",                      1203         "nr_isolated_file",
1229         "workingset_nodes",                      1204         "workingset_nodes",
1230         "workingset_refault_anon",               1205         "workingset_refault_anon",
1231         "workingset_refault_file",               1206         "workingset_refault_file",
1232         "workingset_activate_anon",              1207         "workingset_activate_anon",
1233         "workingset_activate_file",              1208         "workingset_activate_file",
1234         "workingset_restore_anon",               1209         "workingset_restore_anon",
1235         "workingset_restore_file",               1210         "workingset_restore_file",
1236         "workingset_nodereclaim",                1211         "workingset_nodereclaim",
1237         "nr_anon_pages",                         1212         "nr_anon_pages",
1238         "nr_mapped",                             1213         "nr_mapped",
1239         "nr_file_pages",                         1214         "nr_file_pages",
1240         "nr_dirty",                              1215         "nr_dirty",
1241         "nr_writeback",                          1216         "nr_writeback",
1242         "nr_writeback_temp",                     1217         "nr_writeback_temp",
1243         "nr_shmem",                              1218         "nr_shmem",
1244         "nr_shmem_hugepages",                    1219         "nr_shmem_hugepages",
1245         "nr_shmem_pmdmapped",                    1220         "nr_shmem_pmdmapped",
1246         "nr_file_hugepages",                     1221         "nr_file_hugepages",
1247         "nr_file_pmdmapped",                     1222         "nr_file_pmdmapped",
1248         "nr_anon_transparent_hugepages",         1223         "nr_anon_transparent_hugepages",
1249         "nr_vmscan_write",                       1224         "nr_vmscan_write",
1250         "nr_vmscan_immediate_reclaim",           1225         "nr_vmscan_immediate_reclaim",
1251         "nr_dirtied",                            1226         "nr_dirtied",
1252         "nr_written",                            1227         "nr_written",
1253         "nr_throttled_written",               << 
1254         "nr_kernel_misc_reclaimable",            1228         "nr_kernel_misc_reclaimable",
1255         "nr_foll_pin_acquired",                  1229         "nr_foll_pin_acquired",
1256         "nr_foll_pin_released",                  1230         "nr_foll_pin_released",
1257         "nr_kernel_stack",                       1231         "nr_kernel_stack",
1258 #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)         1232 #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
1259         "nr_shadow_call_stack",                  1233         "nr_shadow_call_stack",
1260 #endif                                           1234 #endif
1261         "nr_page_table_pages",                   1235         "nr_page_table_pages",
1262         "nr_sec_page_table_pages",            << 
1263 #ifdef CONFIG_IOMMU_SUPPORT                   << 
1264         "nr_iommu_pages",                     << 
1265 #endif                                        << 
1266 #ifdef CONFIG_SWAP                               1236 #ifdef CONFIG_SWAP
1267         "nr_swapcached",                         1237         "nr_swapcached",
1268 #endif                                           1238 #endif
1269 #ifdef CONFIG_NUMA_BALANCING                  !! 1239 
1270         "pgpromote_success",                  !! 1240         /* enum writeback_stat_item counters */
1271         "pgpromote_candidate",                << 
1272 #endif                                        << 
1273         "pgdemote_kswapd",                    << 
1274         "pgdemote_direct",                    << 
1275         "pgdemote_khugepaged",                << 
1276         /* system-wide enum vm_stat_item coun << 
1277         "nr_dirty_threshold",                    1241         "nr_dirty_threshold",
1278         "nr_dirty_background_threshold",         1242         "nr_dirty_background_threshold",
1279         "nr_memmap_pages",                    << 
1280         "nr_memmap_boot_pages",               << 
1281                                                  1243 
1282 #if defined(CONFIG_VM_EVENT_COUNTERS) || defi    1244 #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
1283         /* enum vm_event_item counters */        1245         /* enum vm_event_item counters */
1284         "pgpgin",                                1246         "pgpgin",
1285         "pgpgout",                               1247         "pgpgout",
1286         "pswpin",                                1248         "pswpin",
1287         "pswpout",                               1249         "pswpout",
1288                                                  1250 
1289         TEXTS_FOR_ZONES("pgalloc")               1251         TEXTS_FOR_ZONES("pgalloc")
1290         TEXTS_FOR_ZONES("allocstall")            1252         TEXTS_FOR_ZONES("allocstall")
1291         TEXTS_FOR_ZONES("pgskip")                1253         TEXTS_FOR_ZONES("pgskip")
1292                                                  1254 
1293         "pgfree",                                1255         "pgfree",
1294         "pgactivate",                            1256         "pgactivate",
1295         "pgdeactivate",                          1257         "pgdeactivate",
1296         "pglazyfree",                            1258         "pglazyfree",
1297                                                  1259 
1298         "pgfault",                               1260         "pgfault",
1299         "pgmajfault",                            1261         "pgmajfault",
1300         "pglazyfreed",                           1262         "pglazyfreed",
1301                                                  1263 
1302         "pgrefill",                              1264         "pgrefill",
1303         "pgreuse",                               1265         "pgreuse",
1304         "pgsteal_kswapd",                        1266         "pgsteal_kswapd",
1305         "pgsteal_direct",                        1267         "pgsteal_direct",
1306         "pgsteal_khugepaged",                 !! 1268         "pgdemote_kswapd",
                                                   >> 1269         "pgdemote_direct",
1307         "pgscan_kswapd",                         1270         "pgscan_kswapd",
1308         "pgscan_direct",                         1271         "pgscan_direct",
1309         "pgscan_khugepaged",                  << 
1310         "pgscan_direct_throttle",                1272         "pgscan_direct_throttle",
1311         "pgscan_anon",                           1273         "pgscan_anon",
1312         "pgscan_file",                           1274         "pgscan_file",
1313         "pgsteal_anon",                          1275         "pgsteal_anon",
1314         "pgsteal_file",                          1276         "pgsteal_file",
1315                                                  1277 
1316 #ifdef CONFIG_NUMA                               1278 #ifdef CONFIG_NUMA
1317         "zone_reclaim_success",               << 
1318         "zone_reclaim_failed",                   1279         "zone_reclaim_failed",
1319 #endif                                           1280 #endif
1320         "pginodesteal",                          1281         "pginodesteal",
1321         "slabs_scanned",                         1282         "slabs_scanned",
1322         "kswapd_inodesteal",                     1283         "kswapd_inodesteal",
1323         "kswapd_low_wmark_hit_quickly",          1284         "kswapd_low_wmark_hit_quickly",
1324         "kswapd_high_wmark_hit_quickly",         1285         "kswapd_high_wmark_hit_quickly",
1325         "pageoutrun",                            1286         "pageoutrun",
1326                                                  1287 
1327         "pgrotated",                             1288         "pgrotated",
1328                                                  1289 
1329         "drop_pagecache",                        1290         "drop_pagecache",
1330         "drop_slab",                             1291         "drop_slab",
1331         "oom_kill",                              1292         "oom_kill",
1332                                                  1293 
1333 #ifdef CONFIG_NUMA_BALANCING                     1294 #ifdef CONFIG_NUMA_BALANCING
1334         "numa_pte_updates",                      1295         "numa_pte_updates",
1335         "numa_huge_pte_updates",                 1296         "numa_huge_pte_updates",
1336         "numa_hint_faults",                      1297         "numa_hint_faults",
1337         "numa_hint_faults_local",                1298         "numa_hint_faults_local",
1338         "numa_pages_migrated",                   1299         "numa_pages_migrated",
1339 #endif                                           1300 #endif
1340 #ifdef CONFIG_MIGRATION                          1301 #ifdef CONFIG_MIGRATION
1341         "pgmigrate_success",                     1302         "pgmigrate_success",
1342         "pgmigrate_fail",                        1303         "pgmigrate_fail",
1343         "thp_migration_success",                 1304         "thp_migration_success",
1344         "thp_migration_fail",                    1305         "thp_migration_fail",
1345         "thp_migration_split",                   1306         "thp_migration_split",
1346 #endif                                           1307 #endif
1347 #ifdef CONFIG_COMPACTION                         1308 #ifdef CONFIG_COMPACTION
1348         "compact_migrate_scanned",               1309         "compact_migrate_scanned",
1349         "compact_free_scanned",                  1310         "compact_free_scanned",
1350         "compact_isolated",                      1311         "compact_isolated",
1351         "compact_stall",                         1312         "compact_stall",
1352         "compact_fail",                          1313         "compact_fail",
1353         "compact_success",                       1314         "compact_success",
1354         "compact_daemon_wake",                   1315         "compact_daemon_wake",
1355         "compact_daemon_migrate_scanned",        1316         "compact_daemon_migrate_scanned",
1356         "compact_daemon_free_scanned",           1317         "compact_daemon_free_scanned",
1357 #endif                                           1318 #endif
1358                                                  1319 
1359 #ifdef CONFIG_HUGETLB_PAGE                       1320 #ifdef CONFIG_HUGETLB_PAGE
1360         "htlb_buddy_alloc_success",              1321         "htlb_buddy_alloc_success",
1361         "htlb_buddy_alloc_fail",                 1322         "htlb_buddy_alloc_fail",
1362 #endif                                           1323 #endif
1363 #ifdef CONFIG_CMA                                1324 #ifdef CONFIG_CMA
1364         "cma_alloc_success",                     1325         "cma_alloc_success",
1365         "cma_alloc_fail",                        1326         "cma_alloc_fail",
1366 #endif                                           1327 #endif
1367         "unevictable_pgs_culled",                1328         "unevictable_pgs_culled",
1368         "unevictable_pgs_scanned",               1329         "unevictable_pgs_scanned",
1369         "unevictable_pgs_rescued",               1330         "unevictable_pgs_rescued",
1370         "unevictable_pgs_mlocked",               1331         "unevictable_pgs_mlocked",
1371         "unevictable_pgs_munlocked",             1332         "unevictable_pgs_munlocked",
1372         "unevictable_pgs_cleared",               1333         "unevictable_pgs_cleared",
1373         "unevictable_pgs_stranded",              1334         "unevictable_pgs_stranded",
1374                                                  1335 
1375 #ifdef CONFIG_TRANSPARENT_HUGEPAGE               1336 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1376         "thp_fault_alloc",                       1337         "thp_fault_alloc",
1377         "thp_fault_fallback",                    1338         "thp_fault_fallback",
1378         "thp_fault_fallback_charge",             1339         "thp_fault_fallback_charge",
1379         "thp_collapse_alloc",                    1340         "thp_collapse_alloc",
1380         "thp_collapse_alloc_failed",             1341         "thp_collapse_alloc_failed",
1381         "thp_file_alloc",                        1342         "thp_file_alloc",
1382         "thp_file_fallback",                     1343         "thp_file_fallback",
1383         "thp_file_fallback_charge",              1344         "thp_file_fallback_charge",
1384         "thp_file_mapped",                       1345         "thp_file_mapped",
1385         "thp_split_page",                        1346         "thp_split_page",
1386         "thp_split_page_failed",                 1347         "thp_split_page_failed",
1387         "thp_deferred_split_page",               1348         "thp_deferred_split_page",
1388         "thp_underused_split_page",           << 
1389         "thp_split_pmd",                         1349         "thp_split_pmd",
1390         "thp_scan_exceed_none_pte",           << 
1391         "thp_scan_exceed_swap_pte",           << 
1392         "thp_scan_exceed_share_pte",          << 
1393 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_    1350 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1394         "thp_split_pud",                         1351         "thp_split_pud",
1395 #endif                                           1352 #endif
1396         "thp_zero_page_alloc",                   1353         "thp_zero_page_alloc",
1397         "thp_zero_page_alloc_failed",            1354         "thp_zero_page_alloc_failed",
1398         "thp_swpout",                            1355         "thp_swpout",
1399         "thp_swpout_fallback",                   1356         "thp_swpout_fallback",
1400 #endif                                           1357 #endif
1401 #ifdef CONFIG_MEMORY_BALLOON                     1358 #ifdef CONFIG_MEMORY_BALLOON
1402         "balloon_inflate",                       1359         "balloon_inflate",
1403         "balloon_deflate",                       1360         "balloon_deflate",
1404 #ifdef CONFIG_BALLOON_COMPACTION                 1361 #ifdef CONFIG_BALLOON_COMPACTION
1405         "balloon_migrate",                       1362         "balloon_migrate",
1406 #endif                                           1363 #endif
1407 #endif /* CONFIG_MEMORY_BALLOON */               1364 #endif /* CONFIG_MEMORY_BALLOON */
1408 #ifdef CONFIG_DEBUG_TLBFLUSH                     1365 #ifdef CONFIG_DEBUG_TLBFLUSH
1409         "nr_tlb_remote_flush",                   1366         "nr_tlb_remote_flush",
1410         "nr_tlb_remote_flush_received",          1367         "nr_tlb_remote_flush_received",
1411         "nr_tlb_local_flush_all",                1368         "nr_tlb_local_flush_all",
1412         "nr_tlb_local_flush_one",                1369         "nr_tlb_local_flush_one",
1413 #endif /* CONFIG_DEBUG_TLBFLUSH */               1370 #endif /* CONFIG_DEBUG_TLBFLUSH */
1414                                                  1371 
                                                   >> 1372 #ifdef CONFIG_DEBUG_VM_VMACACHE
                                                   >> 1373         "vmacache_find_calls",
                                                   >> 1374         "vmacache_find_hits",
                                                   >> 1375 #endif
1415 #ifdef CONFIG_SWAP                               1376 #ifdef CONFIG_SWAP
1416         "swap_ra",                               1377         "swap_ra",
1417         "swap_ra_hit",                           1378         "swap_ra_hit",
1418 #ifdef CONFIG_KSM                             << 
1419         "ksm_swpin_copy",                     << 
1420 #endif                                        << 
1421 #endif                                        << 
1422 #ifdef CONFIG_KSM                             << 
1423         "cow_ksm",                            << 
1424 #endif                                        << 
1425 #ifdef CONFIG_ZSWAP                           << 
1426         "zswpin",                             << 
1427         "zswpout",                            << 
1428         "zswpwb",                             << 
1429 #endif                                           1379 #endif
1430 #ifdef CONFIG_X86                                1380 #ifdef CONFIG_X86
1431         "direct_map_level2_splits",              1381         "direct_map_level2_splits",
1432         "direct_map_level3_splits",              1382         "direct_map_level3_splits",
1433 #endif                                           1383 #endif
1434 #ifdef CONFIG_PER_VMA_LOCK_STATS              << 
1435         "vma_lock_success",                   << 
1436         "vma_lock_abort",                     << 
1437         "vma_lock_retry",                     << 
1438         "vma_lock_miss",                      << 
1439 #endif                                        << 
1440 #ifdef CONFIG_DEBUG_STACK_USAGE               << 
1441         "kstack_1k",                          << 
1442 #if THREAD_SIZE > 1024                        << 
1443         "kstack_2k",                          << 
1444 #endif                                        << 
1445 #if THREAD_SIZE > 2048                        << 
1446         "kstack_4k",                          << 
1447 #endif                                        << 
1448 #if THREAD_SIZE > 4096                        << 
1449         "kstack_8k",                          << 
1450 #endif                                        << 
1451 #if THREAD_SIZE > 8192                        << 
1452         "kstack_16k",                         << 
1453 #endif                                        << 
1454 #if THREAD_SIZE > 16384                       << 
1455         "kstack_32k",                         << 
1456 #endif                                        << 
1457 #if THREAD_SIZE > 32768                       << 
1458         "kstack_64k",                         << 
1459 #endif                                        << 
1460 #if THREAD_SIZE > 65536                       << 
1461         "kstack_rest",                        << 
1462 #endif                                        << 
1463 #endif                                        << 
1464 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_    1384 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
1465 };                                               1385 };
1466 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || C    1386 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
1467                                                  1387 
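Every name in vmstat_text[] above appears, one per line and in this order, in /proc/vmstat (the zone, NUMA and node subsets are also reused for the per-zone blocks of /proc/zoneinfo). A minimal user-space sketch for pulling a single counter out of /proc/vmstat; the file and its "name value" format are real, while the program name and the choice of pgmajfault are only illustrative:

/* read_vmstat.c - print one counter from /proc/vmstat (illustrative only) */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/vmstat", "r");
	char name[64];
	unsigned long long val;

	if (!f)
		return 1;
	/* each line is "<counter-name> <value>" */
	while (fscanf(f, "%63s %llu", name, &val) == 2) {
		if (!strcmp(name, "pgmajfault"))
			printf("pgmajfault = %llu\n", val);
	}
	fclose(f);
	return 0;
}
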
1468 #if (defined(CONFIG_DEBUG_FS) && defined(CONF    1388 #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
1469      defined(CONFIG_PROC_FS)                     1389      defined(CONFIG_PROC_FS)
1470 static void *frag_start(struct seq_file *m, l    1390 static void *frag_start(struct seq_file *m, loff_t *pos)
1471 {                                                1391 {
1472         pg_data_t *pgdat;                        1392         pg_data_t *pgdat;
1473         loff_t node = *pos;                      1393         loff_t node = *pos;
1474                                                  1394 
1475         for (pgdat = first_online_pgdat();       1395         for (pgdat = first_online_pgdat();
1476              pgdat && node;                      1396              pgdat && node;
1477              pgdat = next_online_pgdat(pgdat)    1397              pgdat = next_online_pgdat(pgdat))
1478                 --node;                          1398                 --node;
1479                                                  1399 
1480         return pgdat;                            1400         return pgdat;
1481 }                                                1401 }
1482                                                  1402 
1483 static void *frag_next(struct seq_file *m, vo    1403 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
1484 {                                                1404 {
1485         pg_data_t *pgdat = (pg_data_t *)arg;     1405         pg_data_t *pgdat = (pg_data_t *)arg;
1486                                                  1406 
1487         (*pos)++;                                1407         (*pos)++;
1488         return next_online_pgdat(pgdat);         1408         return next_online_pgdat(pgdat);
1489 }                                                1409 }
1490                                                  1410 
1491 static void frag_stop(struct seq_file *m, voi    1411 static void frag_stop(struct seq_file *m, void *arg)
1492 {                                                1412 {
1493 }                                                1413 }
1494                                                  1414 
1495 /*                                               1415 /*
1496  * Walk zones in a node and print using a cal    1416  * Walk zones in a node and print using a callback.
1497  * If @assert_populated is true, only use cal    1417  * If @assert_populated is true, only use callback for zones that are populated.
1498  */                                              1418  */
1499 static void walk_zones_in_node(struct seq_fil    1419 static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
1500                 bool assert_populated, bool n    1420                 bool assert_populated, bool nolock,
1501                 void (*print)(struct seq_file    1421                 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
1502 {                                                1422 {
1503         struct zone *zone;                       1423         struct zone *zone;
1504         struct zone *node_zones = pgdat->node    1424         struct zone *node_zones = pgdat->node_zones;
1505         unsigned long flags;                     1425         unsigned long flags;
1506                                                  1426 
1507         for (zone = node_zones; zone - node_z    1427         for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
1508                 if (assert_populated && !popu    1428                 if (assert_populated && !populated_zone(zone))
1509                         continue;                1429                         continue;
1510                                                  1430 
1511                 if (!nolock)                     1431                 if (!nolock)
1512                         spin_lock_irqsave(&zo    1432                         spin_lock_irqsave(&zone->lock, flags);
1513                 print(m, pgdat, zone);           1433                 print(m, pgdat, zone);
1514                 if (!nolock)                     1434                 if (!nolock)
1515                         spin_unlock_irqrestor    1435                         spin_unlock_irqrestore(&zone->lock, flags);
1516         }                                        1436         }
1517 }                                                1437 }
1518 #endif                                           1438 #endif
1519                                                  1439 
1520 #ifdef CONFIG_PROC_FS                            1440 #ifdef CONFIG_PROC_FS
1521 static void frag_show_print(struct seq_file *    1441 static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
1522                                                  1442                                                 struct zone *zone)
1523 {                                                1443 {
1524         int order;                               1444         int order;
1525                                                  1445 
1526         seq_printf(m, "Node %d, zone %8s ", p    1446         seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1527         for (order = 0; order < NR_PAGE_ORDER !! 1447         for (order = 0; order < MAX_ORDER; ++order)
1528                 /*                            !! 1448                 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
1529                  * Access to nr_free is lockl << 
1530                  * printing purposes. Use dat << 
1531                  */                           << 
1532                 seq_printf(m, "%6lu ", data_r << 
1533         seq_putc(m, '\n');                       1449         seq_putc(m, '\n');
1534 }                                                1450 }
1535                                                  1451 
1536 /*                                               1452 /*
1537  * This walks the free areas for each zone.      1453  * This walks the free areas for each zone.
1538  */                                              1454  */
1539 static int frag_show(struct seq_file *m, void    1455 static int frag_show(struct seq_file *m, void *arg)
1540 {                                                1456 {
1541         pg_data_t *pgdat = (pg_data_t *)arg;     1457         pg_data_t *pgdat = (pg_data_t *)arg;
1542         walk_zones_in_node(m, pgdat, true, fa    1458         walk_zones_in_node(m, pgdat, true, false, frag_show_print);
1543         return 0;                                1459         return 0;
1544 }                                                1460 }
1545                                                  1461 
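The frag_show()/frag_show_print() pair above is what backs /proc/buddyinfo: one line per (node, zone) carrying the free-block count for every order. A hedged parsing sketch, assuming only the line format emitted by frag_show_print (the program name is illustrative):

/* buddyinfo.c - sum the free pages reported in /proc/buddyinfo (sketch) */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/buddyinfo", "r");
	char zone[16];
	int node;

	if (!f)
		return 1;
	/* each line: "Node N, zone NAME  c0 c1 ... c(max order)" */
	while (fscanf(f, " Node %d, zone %15s", &node, zone) == 2) {
		unsigned long total_pages = 0, nr;
		int order = 0;

		/* an order-k block is 2^k pages */
		while (fscanf(f, "%lu", &nr) == 1)
			total_pages += nr << order++;
		printf("node %d zone %-8s free pages %lu\n",
		       node, zone, total_pages);
	}
	fclose(f);
	return 0;
}
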
1546 static void pagetypeinfo_showfree_print(struc    1462 static void pagetypeinfo_showfree_print(struct seq_file *m,
1547                                         pg_da    1463                                         pg_data_t *pgdat, struct zone *zone)
1548 {                                                1464 {
1549         int order, mtype;                        1465         int order, mtype;
1550                                                  1466 
1551         for (mtype = 0; mtype < MIGRATE_TYPES    1467         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
1552                 seq_printf(m, "Node %4d, zone    1468                 seq_printf(m, "Node %4d, zone %8s, type %12s ",
1553                                         pgdat    1469                                         pgdat->node_id,
1554                                         zone-    1470                                         zone->name,
1555                                         migra    1471                                         migratetype_names[mtype]);
1556                 for (order = 0; order < NR_PA !! 1472                 for (order = 0; order < MAX_ORDER; ++order) {
1557                         unsigned long freecou    1473                         unsigned long freecount = 0;
1558                         struct free_area *are    1474                         struct free_area *area;
1559                         struct list_head *cur    1475                         struct list_head *curr;
1560                         bool overflow = false    1476                         bool overflow = false;
1561                                                  1477 
1562                         area = &(zone->free_a    1478                         area = &(zone->free_area[order]);
1563                                                  1479 
1564                         list_for_each(curr, &    1480                         list_for_each(curr, &area->free_list[mtype]) {
1565                                 /*               1481                                 /*
1566                                  * Cap the fr    1482                                  * Cap the free_list iteration because it might
1567                                  * be really     1483                                  * be really large and we are under a spinlock
1568                                  * so a long     1484                                  * so a long time spent here could trigger a
1569                                  * hard locku    1485                                  * hard lockup detector. Anyway this is a
1570                                  * debugging     1486                                  * debugging tool so knowing there is a handful
1571                                  * of pages o    1487                                  * of pages of this order should be more than
1572                                  * sufficient    1488                                  * sufficient.
1573                                  */              1489                                  */
1574                                 if (++freecou    1490                                 if (++freecount >= 100000) {
1575                                         overf    1491                                         overflow = true;
1576                                         break    1492                                         break;
1577                                 }                1493                                 }
1578                         }                        1494                         }
1579                         seq_printf(m, "%s%6lu    1495                         seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
1580                         spin_unlock_irq(&zone    1496                         spin_unlock_irq(&zone->lock);
1581                         cond_resched();          1497                         cond_resched();
1582                         spin_lock_irq(&zone->    1498                         spin_lock_irq(&zone->lock);
1583                 }                                1499                 }
1584                 seq_putc(m, '\n');               1500                 seq_putc(m, '\n');
1585         }                                        1501         }
1586 }                                                1502 }
1587                                                  1503 
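Note the cap above: once 100000 entries of a free list have been walked, the count is printed with a leading '>' (see the "%s%6lu" format), so anything consuming /proc/pagetypeinfo has to accept ">100000" as well as plain numbers. A small illustrative helper, not part of the kernel:

/* parse one pagetypeinfo count; values cut short by the 100000 cap above
 * are printed with a leading '>' (e.g. ">100000"). Sketch only. */
#include <stdbool.h>
#include <stdlib.h>

unsigned long parse_count(const char *tok, bool *capped)
{
	*capped = (tok[0] == '>');
	return strtoul(*capped ? tok + 1 : tok, NULL, 10);
}
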
1588 /* Print out the free pages at each order for    1504 /* Print out the free pages at each order for each migratetype */
1589 static void pagetypeinfo_showfree(struct seq_    1505 static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
1590 {                                                1506 {
1591         int order;                               1507         int order;
1592         pg_data_t *pgdat = (pg_data_t *)arg;     1508         pg_data_t *pgdat = (pg_data_t *)arg;
1593                                                  1509 
1594         /* Print header */                       1510         /* Print header */
1595         seq_printf(m, "%-43s ", "Free pages c    1511         seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
1596         for (order = 0; order < NR_PAGE_ORDER !! 1512         for (order = 0; order < MAX_ORDER; ++order)
1597                 seq_printf(m, "%6d ", order);    1513                 seq_printf(m, "%6d ", order);
1598         seq_putc(m, '\n');                       1514         seq_putc(m, '\n');
1599                                                  1515 
1600         walk_zones_in_node(m, pgdat, true, fa    1516         walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print);
1601 }                                                1517 }
1602                                                  1518 
1603 static void pagetypeinfo_showblockcount_print    1519 static void pagetypeinfo_showblockcount_print(struct seq_file *m,
1604                                         pg_da    1520                                         pg_data_t *pgdat, struct zone *zone)
1605 {                                                1521 {
1606         int mtype;                               1522         int mtype;
1607         unsigned long pfn;                       1523         unsigned long pfn;
1608         unsigned long start_pfn = zone->zone_    1524         unsigned long start_pfn = zone->zone_start_pfn;
1609         unsigned long end_pfn = zone_end_pfn(    1525         unsigned long end_pfn = zone_end_pfn(zone);
1610         unsigned long count[MIGRATE_TYPES] =     1526         unsigned long count[MIGRATE_TYPES] = { 0, };
1611                                                  1527 
1612         for (pfn = start_pfn; pfn < end_pfn;     1528         for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
1613                 struct page *page;               1529                 struct page *page;
1614                                                  1530 
1615                 page = pfn_to_online_page(pfn    1531                 page = pfn_to_online_page(pfn);
1616                 if (!page)                       1532                 if (!page)
1617                         continue;                1533                         continue;
1618                                                  1534 
1619                 if (page_zone(page) != zone)     1535                 if (page_zone(page) != zone)
1620                         continue;                1536                         continue;
1621                                                  1537 
1622                 mtype = get_pageblock_migrate    1538                 mtype = get_pageblock_migratetype(page);
1623                                                  1539 
1624                 if (mtype < MIGRATE_TYPES)       1540                 if (mtype < MIGRATE_TYPES)
1625                         count[mtype]++;          1541                         count[mtype]++;
1626         }                                        1542         }
1627                                                  1543 
1628         /* Print counts */                       1544         /* Print counts */
1629         seq_printf(m, "Node %d, zone %8s ", p    1545         seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
1630         for (mtype = 0; mtype < MIGRATE_TYPES    1546         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1631                 seq_printf(m, "%12lu ", count    1547                 seq_printf(m, "%12lu ", count[mtype]);
1632         seq_putc(m, '\n');                       1548         seq_putc(m, '\n');
1633 }                                                1549 }
1634                                                  1550 
1635 /* Print out the number of pageblocks for eac    1551 /* Print out the number of pageblocks for each migratetype */
1636 static void pagetypeinfo_showblockcount(struc    1552 static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
1637 {                                                1553 {
1638         int mtype;                               1554         int mtype;
1639         pg_data_t *pgdat = (pg_data_t *)arg;     1555         pg_data_t *pgdat = (pg_data_t *)arg;
1640                                                  1556 
1641         seq_printf(m, "\n%-23s", "Number of b    1557         seq_printf(m, "\n%-23s", "Number of blocks type ");
1642         for (mtype = 0; mtype < MIGRATE_TYPES    1558         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1643                 seq_printf(m, "%12s ", migrat    1559                 seq_printf(m, "%12s ", migratetype_names[mtype]);
1644         seq_putc(m, '\n');                       1560         seq_putc(m, '\n');
1645         walk_zones_in_node(m, pgdat, true, fa    1561         walk_zones_in_node(m, pgdat, true, false,
1646                 pagetypeinfo_showblockcount_p    1562                 pagetypeinfo_showblockcount_print);
1647 }                                                1563 }
1648                                                  1564 
1649 /*                                               1565 /*
1650  * Print out the number of pageblocks for eac    1566  * Print out the number of pageblocks for each migratetype that contain pages
1651  * of other types. This gives an indication o    1567  * of other types. This gives an indication of how well fallbacks are being
1652  * contained by rmqueue_fallback(). It requir    1568  * contained by rmqueue_fallback(). It requires information from PAGE_OWNER
1653  * to determine what is going on                 1569  * to determine what is going on
1654  */                                              1570  */
1655 static void pagetypeinfo_showmixedcount(struc    1571 static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
1656 {                                                1572 {
1657 #ifdef CONFIG_PAGE_OWNER                         1573 #ifdef CONFIG_PAGE_OWNER
1658         int mtype;                               1574         int mtype;
1659                                                  1575 
1660         if (!static_branch_unlikely(&page_own    1576         if (!static_branch_unlikely(&page_owner_inited))
1661                 return;                          1577                 return;
1662                                                  1578 
1663         drain_all_pages(NULL);                   1579         drain_all_pages(NULL);
1664                                                  1580 
1665         seq_printf(m, "\n%-23s", "Number of m    1581         seq_printf(m, "\n%-23s", "Number of mixed blocks ");
1666         for (mtype = 0; mtype < MIGRATE_TYPES    1582         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
1667                 seq_printf(m, "%12s ", migrat    1583                 seq_printf(m, "%12s ", migratetype_names[mtype]);
1668         seq_putc(m, '\n');                       1584         seq_putc(m, '\n');
1669                                                  1585 
1670         walk_zones_in_node(m, pgdat, true, tr    1586         walk_zones_in_node(m, pgdat, true, true,
1671                 pagetypeinfo_showmixedcount_p    1587                 pagetypeinfo_showmixedcount_print);
1672 #endif /* CONFIG_PAGE_OWNER */                   1588 #endif /* CONFIG_PAGE_OWNER */
1673 }                                                1589 }
1674                                                  1590 
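The mixed-block section above is emitted only when page owner tracking is live, i.e. the kernel was built with CONFIG_PAGE_OWNER and booted with page_owner=on; otherwise page_owner_inited stays false and the function returns before printing anything. A rough user-space check, illustrative only:

/* has_page_owner.c - rough check whether page_owner=on was passed at boot */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char cmdline[4096] = "";
	FILE *f = fopen("/proc/cmdline", "r");

	if (f) {
		fgets(cmdline, sizeof(cmdline), f);
		fclose(f);
	}
	puts(strstr(cmdline, "page_owner=on") ?
	     "page_owner enabled on the command line" :
	     "page_owner not requested (the mixed-block section will be absent)");
	return 0;
}
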
1675 /*                                               1591 /*
1676  * This prints out statistics in relation to     1592  * This prints out statistics in relation to grouping pages by mobility.
1677  * It is expensive to collect so do not const    1593  * It is expensive to collect so do not constantly read the file.
1678  */                                              1594  */
1679 static int pagetypeinfo_show(struct seq_file     1595 static int pagetypeinfo_show(struct seq_file *m, void *arg)
1680 {                                                1596 {
1681         pg_data_t *pgdat = (pg_data_t *)arg;     1597         pg_data_t *pgdat = (pg_data_t *)arg;
1682                                                  1598 
1683         /* check memoryless node */              1599         /* check memoryless node */
1684         if (!node_state(pgdat->node_id, N_MEM    1600         if (!node_state(pgdat->node_id, N_MEMORY))
1685                 return 0;                        1601                 return 0;
1686                                                  1602 
1687         seq_printf(m, "Page block order: %d\n    1603         seq_printf(m, "Page block order: %d\n", pageblock_order);
1688         seq_printf(m, "Pages per block:  %lu\    1604         seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
1689         seq_putc(m, '\n');                       1605         seq_putc(m, '\n');
1690         pagetypeinfo_showfree(m, pgdat);         1606         pagetypeinfo_showfree(m, pgdat);
1691         pagetypeinfo_showblockcount(m, pgdat)    1607         pagetypeinfo_showblockcount(m, pgdat);
1692         pagetypeinfo_showmixedcount(m, pgdat)    1608         pagetypeinfo_showmixedcount(m, pgdat);
1693                                                  1609 
1694         return 0;                                1610         return 0;
1695 }                                                1611 }
1696                                                  1612 
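As the comment above says, generating this file walks every pageblock of every zone, so it is meant for occasional debugging reads rather than polling; on current kernels the proc entry is also typically created with mode 0400, so reading it usually needs root. A one-shot dump sketch (program name illustrative):

/* dump_pagetypeinfo.c - one-shot dump; the file is expensive to generate,
 * so read it on demand rather than in a polling loop. */
#include <stdio.h>

int main(void)
{
	char buf[4096];
	size_t n;
	FILE *f = fopen("/proc/pagetypeinfo", "r");

	if (!f) {
		perror("pagetypeinfo");
		return 1;
	}
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		fwrite(buf, 1, n, stdout);
	fclose(f);
	return 0;
}
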
1697 static const struct seq_operations fragmentat    1613 static const struct seq_operations fragmentation_op = {
1698         .start  = frag_start,                    1614         .start  = frag_start,
1699         .next   = frag_next,                     1615         .next   = frag_next,
1700         .stop   = frag_stop,                     1616         .stop   = frag_stop,
1701         .show   = frag_show,                     1617         .show   = frag_show,
1702 };                                               1618 };
1703                                                  1619 
1704 static const struct seq_operations pagetypein    1620 static const struct seq_operations pagetypeinfo_op = {
1705         .start  = frag_start,                    1621         .start  = frag_start,
1706         .next   = frag_next,                     1622         .next   = frag_next,
1707         .stop   = frag_stop,                     1623         .stop   = frag_stop,
1708         .show   = pagetypeinfo_show,             1624         .show   = pagetypeinfo_show,
1709 };                                               1625 };
1710                                                  1626 
1711 static bool is_zone_first_populated(pg_data_t    1627 static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
1712 {                                                1628 {
1713         int zid;                                 1629         int zid;
1714                                                  1630 
1715         for (zid = 0; zid < MAX_NR_ZONES; zid    1631         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1716                 struct zone *compare = &pgdat    1632                 struct zone *compare = &pgdat->node_zones[zid];
1717                                                  1633 
1718                 if (populated_zone(compare))     1634                 if (populated_zone(compare))
1719                         return zone == compar    1635                         return zone == compare;
1720         }                                        1636         }
1721                                                  1637 
1722         return false;                            1638         return false;
1723 }                                                1639 }
1724                                                  1640 
1725 static void zoneinfo_show_print(struct seq_fi    1641 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1726                                                  1642                                                         struct zone *zone)
1727 {                                                1643 {
1728         int i;                                   1644         int i;
1729         seq_printf(m, "Node %d, zone %8s", pg    1645         seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
1730         if (is_zone_first_populated(pgdat, zo    1646         if (is_zone_first_populated(pgdat, zone)) {
1731                 seq_printf(m, "\n  per-node s    1647                 seq_printf(m, "\n  per-node stats");
1732                 for (i = 0; i < NR_VM_NODE_ST    1648                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1733                         unsigned long pages =    1649                         unsigned long pages = node_page_state_pages(pgdat, i);
1734                                                  1650 
1735                         if (vmstat_item_print    1651                         if (vmstat_item_print_in_thp(i))
1736                                 pages /= HPAG    1652                                 pages /= HPAGE_PMD_NR;
1737                         seq_printf(m, "\n        1653                         seq_printf(m, "\n      %-12s %lu", node_stat_name(i),
1738                                    pages);       1654                                    pages);
1739                 }                                1655                 }
1740         }                                        1656         }
1741         seq_printf(m,                            1657         seq_printf(m,
1742                    "\n  pages free     %lu"      1658                    "\n  pages free     %lu"
1743                    "\n        boost    %lu"   << 
1744                    "\n        min      %lu"      1659                    "\n        min      %lu"
1745                    "\n        low      %lu"      1660                    "\n        low      %lu"
1746                    "\n        high     %lu"      1661                    "\n        high     %lu"
1747                    "\n        promo    %lu"   << 
1748                    "\n        spanned  %lu"      1662                    "\n        spanned  %lu"
1749                    "\n        present  %lu"      1663                    "\n        present  %lu"
1750                    "\n        managed  %lu"      1664                    "\n        managed  %lu"
1751                    "\n        cma      %lu",     1665                    "\n        cma      %lu",
1752                    zone_page_state(zone, NR_F    1666                    zone_page_state(zone, NR_FREE_PAGES),
1753                    zone->watermark_boost,     << 
1754                    min_wmark_pages(zone),        1667                    min_wmark_pages(zone),
1755                    low_wmark_pages(zone),        1668                    low_wmark_pages(zone),
1756                    high_wmark_pages(zone),       1669                    high_wmark_pages(zone),
1757                    promo_wmark_pages(zone),   << 
1758                    zone->spanned_pages,          1670                    zone->spanned_pages,
1759                    zone->present_pages,          1671                    zone->present_pages,
1760                    zone_managed_pages(zone),     1672                    zone_managed_pages(zone),
1761                    zone_cma_pages(zone));        1673                    zone_cma_pages(zone));
1762                                                  1674 
1763         seq_printf(m,                            1675         seq_printf(m,
1764                    "\n        protection: (%l    1676                    "\n        protection: (%ld",
1765                    zone->lowmem_reserve[0]);     1677                    zone->lowmem_reserve[0]);
1766         for (i = 1; i < ARRAY_SIZE(zone->lowm    1678         for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
1767                 seq_printf(m, ", %ld", zone->    1679                 seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
1768         seq_putc(m, ')');                        1680         seq_putc(m, ')');
1769                                                  1681 
1770         /* If unpopulated, no other informati    1682         /* If unpopulated, no other information is useful */
1771         if (!populated_zone(zone)) {             1683         if (!populated_zone(zone)) {
1772                 seq_putc(m, '\n');               1684                 seq_putc(m, '\n');
1773                 return;                          1685                 return;
1774         }                                        1686         }
1775                                                  1687 
1776         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS    1688         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1777                 seq_printf(m, "\n      %-12s     1689                 seq_printf(m, "\n      %-12s %lu", zone_stat_name(i),
1778                            zone_page_state(zo    1690                            zone_page_state(zone, i));
1779                                                  1691 
1780 #ifdef CONFIG_NUMA                               1692 #ifdef CONFIG_NUMA
1781         for (i = 0; i < NR_VM_NUMA_EVENT_ITEM    1693         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1782                 seq_printf(m, "\n      %-12s     1694                 seq_printf(m, "\n      %-12s %lu", numa_stat_name(i),
1783                            zone_numa_event_st    1695                            zone_numa_event_state(zone, i));
1784 #endif                                           1696 #endif
1785                                                  1697 
1786         seq_printf(m, "\n  pagesets");           1698         seq_printf(m, "\n  pagesets");
1787         for_each_online_cpu(i) {                 1699         for_each_online_cpu(i) {
1788                 struct per_cpu_pages *pcp;       1700                 struct per_cpu_pages *pcp;
1789                 struct per_cpu_zonestat __may    1701                 struct per_cpu_zonestat __maybe_unused *pzstats;
1790                                                  1702 
1791                 pcp = per_cpu_ptr(zone->per_c    1703                 pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
1792                 seq_printf(m,                    1704                 seq_printf(m,
1793                            "\n    cpu: %i"       1705                            "\n    cpu: %i"
1794                            "\n              c    1706                            "\n              count: %i"
1795                            "\n              h    1707                            "\n              high:  %i"
1796                            "\n              b    1708                            "\n              batch: %i",
1797                            i,                    1709                            i,
1798                            pcp->count,           1710                            pcp->count,
1799                            pcp->high,            1711                            pcp->high,
1800                            pcp->batch);          1712                            pcp->batch);
1801 #ifdef CONFIG_SMP                                1713 #ifdef CONFIG_SMP
1802                 pzstats = per_cpu_ptr(zone->p    1714                 pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
1803                 seq_printf(m, "\n  vm stats t    1715                 seq_printf(m, "\n  vm stats threshold: %d",
1804                                 pzstats->stat    1716                                 pzstats->stat_threshold);
1805 #endif                                           1717 #endif
1806         }                                        1718         }
1807         seq_printf(m,                            1719         seq_printf(m,
1808                    "\n  node_unreclaimable:      1720                    "\n  node_unreclaimable:  %u"
1809                    "\n  start_pfn:               1721                    "\n  start_pfn:           %lu",
1810                    pgdat->kswapd_failures >=     1722                    pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
1811                    zone->zone_start_pfn);        1723                    zone->zone_start_pfn);
1812         seq_putc(m, '\n');                       1724         seq_putc(m, '\n');
1813 }                                                1725 }
1814                                                  1726 
1815 /*                                               1727 /*
1816  * Output information about zones in @pgdat.     1728  * Output information about zones in @pgdat.  All zones are printed regardless
1817  * of whether they are populated or not: lowm    1729  * of whether they are populated or not: lowmem_reserve_ratio operates on the
1818  * set of all zones and userspace would not b    1730  * set of all zones and userspace would not be aware of such zones if they are
1819  * suppressed here (zoneinfo displays the eff    1731  * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
1820  */                                              1732  */
1821 static int zoneinfo_show(struct seq_file *m,     1733 static int zoneinfo_show(struct seq_file *m, void *arg)
1822 {                                                1734 {
1823         pg_data_t *pgdat = (pg_data_t *)arg;     1735         pg_data_t *pgdat = (pg_data_t *)arg;
1824         walk_zones_in_node(m, pgdat, false, f    1736         walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print);
1825         return 0;                                1737         return 0;
1826 }                                                1738 }
1827                                                  1739 
1828 static const struct seq_operations zoneinfo_o    1740 static const struct seq_operations zoneinfo_op = {
1829         .start  = frag_start, /* iterate over    1741         .start  = frag_start, /* iterate over all zones. The same as in
1830                                * fragmentatio    1742                                * fragmentation. */
1831         .next   = frag_next,                     1743         .next   = frag_next,
1832         .stop   = frag_stop,                     1744         .stop   = frag_stop,
1833         .show   = zoneinfo_show,                 1745         .show   = zoneinfo_show,
1834 };                                               1746 };
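zoneinfo_show_print() above produces the per-zone blocks of /proc/zoneinfo, including the min/low/high watermarks. A hedged sketch that extracts the high watermark of each zone, relying only on the field layout emitted above (program name illustrative; the per-cpu "high:" lines carry a colon and are therefore skipped by the format string):

/* zone_watermarks.c - print each zone's high watermark from /proc/zoneinfo */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/zoneinfo", "r");
	char line[256], zone[32] = "?";
	int node = -1;

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		unsigned long high;

		if (sscanf(line, "Node %d, zone %31s", &node, zone) == 2)
			continue;
		if (sscanf(line, " high %lu", &high) == 1)
			printf("node %d zone %-8s high watermark %lu pages\n",
			       node, zone, high);
	}
	fclose(f);
	return 0;
}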
1835                                                  1747 
1836 #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEM    1748 #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
1837                          NR_VM_NUMA_EVENT_ITE    1749                          NR_VM_NUMA_EVENT_ITEMS + \
1838                          NR_VM_NODE_STAT_ITEM    1750                          NR_VM_NODE_STAT_ITEMS + \
1839                          NR_VM_STAT_ITEMS + \ !! 1751                          NR_VM_WRITEBACK_STAT_ITEMS + \
1840                          (IS_ENABLED(CONFIG_V    1752                          (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
1841                           NR_VM_EVENT_ITEMS :    1753                           NR_VM_EVENT_ITEMS : 0))
1842                                                  1754 
1843 static void *vmstat_start(struct seq_file *m,    1755 static void *vmstat_start(struct seq_file *m, loff_t *pos)
1844 {                                                1756 {
1845         unsigned long *v;                        1757         unsigned long *v;
1846         int i;                                   1758         int i;
1847                                                  1759 
1848         if (*pos >= NR_VMSTAT_ITEMS)             1760         if (*pos >= NR_VMSTAT_ITEMS)
1849                 return NULL;                     1761                 return NULL;
1850                                                  1762 
1851         BUILD_BUG_ON(ARRAY_SIZE(vmstat_text)     1763         BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
1852         fold_vm_numa_events();                   1764         fold_vm_numa_events();
1853         v = kmalloc_array(NR_VMSTAT_ITEMS, si    1765         v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
1854         m->private = v;                          1766         m->private = v;
1855         if (!v)                                  1767         if (!v)
1856                 return ERR_PTR(-ENOMEM);         1768                 return ERR_PTR(-ENOMEM);
1857         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS    1769         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1858                 v[i] = global_zone_page_state    1770                 v[i] = global_zone_page_state(i);
1859         v += NR_VM_ZONE_STAT_ITEMS;              1771         v += NR_VM_ZONE_STAT_ITEMS;
1860                                                  1772 
1861 #ifdef CONFIG_NUMA                               1773 #ifdef CONFIG_NUMA
1862         for (i = 0; i < NR_VM_NUMA_EVENT_ITEM    1774         for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++)
1863                 v[i] = global_numa_event_stat    1775                 v[i] = global_numa_event_state(i);
1864         v += NR_VM_NUMA_EVENT_ITEMS;             1776         v += NR_VM_NUMA_EVENT_ITEMS;
1865 #endif                                           1777 #endif
1866                                                  1778 
1867         for (i = 0; i < NR_VM_NODE_STAT_ITEMS    1779         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1868                 v[i] = global_node_page_state    1780                 v[i] = global_node_page_state_pages(i);
1869                 if (vmstat_item_print_in_thp(    1781                 if (vmstat_item_print_in_thp(i))
1870                         v[i] /= HPAGE_PMD_NR;    1782                         v[i] /= HPAGE_PMD_NR;
1871         }                                        1783         }
1872         v += NR_VM_NODE_STAT_ITEMS;              1784         v += NR_VM_NODE_STAT_ITEMS;
1873                                                  1785 
1874         global_dirty_limits(v + NR_DIRTY_BG_T    1786         global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1875                             v + NR_DIRTY_THRE    1787                             v + NR_DIRTY_THRESHOLD);
1876         v[NR_MEMMAP_PAGES] = atomic_long_read !! 1788         v += NR_VM_WRITEBACK_STAT_ITEMS;
1877         v[NR_MEMMAP_BOOT_PAGES] = atomic_long << 
1878         v += NR_VM_STAT_ITEMS;                << 
1879                                                  1789 
1880 #ifdef CONFIG_VM_EVENT_COUNTERS                  1790 #ifdef CONFIG_VM_EVENT_COUNTERS
1881         all_vm_events(v);                        1791         all_vm_events(v);
1882         v[PGPGIN] /= 2;         /* sectors ->    1792         v[PGPGIN] /= 2;         /* sectors -> kbytes */
1883         v[PGPGOUT] /= 2;                         1793         v[PGPGOUT] /= 2;
1884 #endif                                           1794 #endif
1885         return (unsigned long *)m->private +     1795         return (unsigned long *)m->private + *pos;
1886 }                                                1796 }
1887                                                  1797 
1888 static void *vmstat_next(struct seq_file *m,     1798 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
1889 {                                                1799 {
1890         (*pos)++;                                1800         (*pos)++;
1891         if (*pos >= NR_VMSTAT_ITEMS)             1801         if (*pos >= NR_VMSTAT_ITEMS)
1892                 return NULL;                     1802                 return NULL;
1893         return (unsigned long *)m->private +     1803         return (unsigned long *)m->private + *pos;
1894 }                                                1804 }
1895                                                  1805 
1896 static int vmstat_show(struct seq_file *m, vo    1806 static int vmstat_show(struct seq_file *m, void *arg)
1897 {                                                1807 {
1898         unsigned long *l = arg;                  1808         unsigned long *l = arg;
1899         unsigned long off = l - (unsigned lon    1809         unsigned long off = l - (unsigned long *)m->private;
1900                                                  1810 
1901         seq_puts(m, vmstat_text[off]);           1811         seq_puts(m, vmstat_text[off]);
1902         seq_put_decimal_ull(m, " ", *l);         1812         seq_put_decimal_ull(m, " ", *l);
1903         seq_putc(m, '\n');                       1813         seq_putc(m, '\n');
1904                                                  1814 
1905         if (off == NR_VMSTAT_ITEMS - 1) {        1815         if (off == NR_VMSTAT_ITEMS - 1) {
1906                 /*                               1816                 /*
1907                  * We've come to the end - ad    1817                  * We've come to the end - add any deprecated counters to avoid
1908                  * breaking userspace which m    1818                  * breaking userspace which might depend on them being present.
1909                  */                              1819                  */
1910                 seq_puts(m, "nr_unstable 0\n"    1820                 seq_puts(m, "nr_unstable 0\n");
1911         }                                        1821         }
1912         return 0;                                1822         return 0;
1913 }                                                1823 }
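
For illustration only (this sketch is not part of vmstat.c): vmstat_show() above emits one counter per line in the form "<name> <value>", so a minimal user-space reader of /proc/vmstat can be written against that format. The counter name "nr_free_pages" below is simply a typical field picked for the example.

#include <stdio.h>
#include <string.h>

/* Look up one "<name> <value>" record in /proc/vmstat; return -1 if absent. */
static long long read_vmstat(const char *name)
{
	char key[64];
	long long val;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return -1;
	while (fscanf(f, "%63s %lld", key, &val) == 2) {
		if (!strcmp(key, name)) {
			fclose(f);
			return val;
		}
	}
	fclose(f);
	return -1;
}

int main(void)
{
	printf("nr_free_pages = %lld\n", read_vmstat("nr_free_pages"));
	return 0;
}
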
1914                                                  1824 
1915 static void vmstat_stop(struct seq_file *m, v    1825 static void vmstat_stop(struct seq_file *m, void *arg)
1916 {                                                1826 {
1917         kfree(m->private);                       1827         kfree(m->private);
1918         m->private = NULL;                       1828         m->private = NULL;
1919 }                                                1829 }
1920                                                  1830 
1921 static const struct seq_operations vmstat_op     1831 static const struct seq_operations vmstat_op = {
1922         .start  = vmstat_start,                  1832         .start  = vmstat_start,
1923         .next   = vmstat_next,                   1833         .next   = vmstat_next,
1924         .stop   = vmstat_stop,                   1834         .stop   = vmstat_stop,
1925         .show   = vmstat_show,                   1835         .show   = vmstat_show,
1926 };                                               1836 };
1927 #endif /* CONFIG_PROC_FS */                      1837 #endif /* CONFIG_PROC_FS */
1928                                                  1838 
1929 #ifdef CONFIG_SMP                                1839 #ifdef CONFIG_SMP
1930 static DEFINE_PER_CPU(struct delayed_work, vm    1840 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1931 int sysctl_stat_interval __read_mostly = HZ;     1841 int sysctl_stat_interval __read_mostly = HZ;
1932                                                  1842 
1933 #ifdef CONFIG_PROC_FS                            1843 #ifdef CONFIG_PROC_FS
1934 static void refresh_vm_stats(struct work_stru    1844 static void refresh_vm_stats(struct work_struct *work)
1935 {                                                1845 {
1936         refresh_cpu_vm_stats(true);              1846         refresh_cpu_vm_stats(true);
1937 }                                                1847 }
1938                                                  1848 
1939 int vmstat_refresh(const struct ctl_table *ta !! 1849 int vmstat_refresh(struct ctl_table *table, int write,
1940                    void *buffer, size_t *lenp    1850                    void *buffer, size_t *lenp, loff_t *ppos)
1941 {                                                1851 {
1942         long val;                                1852         long val;
1943         int err;                                 1853         int err;
1944         int i;                                   1854         int i;
1945                                                  1855 
1946         /*                                       1856         /*
1947          * The regular update, every sysctl_s    1857          * The regular update, every sysctl_stat_interval, may come later
1948          * than expected: leaving a significa    1858          * than expected: leaving a significant amount in per_cpu buckets.
1949          * This is particularly misleading wh    1859          * This is particularly misleading when checking a quantity of HUGE
1950          * pages, immediately after running a    1860          * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
1951          * which can equally be echo'ed to or    1861          * which can equally be echo'ed to or cat'ted from (by root),
1952          * can be used to update the stats ju    1862          * can be used to update the stats just before reading them.
1953          *                                       1863          *
1954          * Oh, and since global_zone_page_sta    1864          * Oh, and since global_zone_page_state() etc. are so careful to hide
1955          * transiently negative values, repor    1865          * transiently negative values, report an error here if any of
1956          * the stats is negative, so we know     1866          * the stats is negative, so we know to go looking for imbalance.
1957          */                                      1867          */
1958         err = schedule_on_each_cpu(refresh_vm    1868         err = schedule_on_each_cpu(refresh_vm_stats);
1959         if (err)                                 1869         if (err)
1960                 return err;                      1870                 return err;
1961         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS    1871         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
1962                 /*                               1872                 /*
1963                  * Skip checking stats known     1873                  * Skip checking stats known to go negative occasionally.
1964                  */                              1874                  */
1965                 switch (i) {                     1875                 switch (i) {
1966                 case NR_ZONE_WRITE_PENDING:      1876                 case NR_ZONE_WRITE_PENDING:
1967                 case NR_FREE_CMA_PAGES:          1877                 case NR_FREE_CMA_PAGES:
1968                         continue;                1878                         continue;
1969                 }                                1879                 }
1970                 val = atomic_long_read(&vm_zo    1880                 val = atomic_long_read(&vm_zone_stat[i]);
1971                 if (val < 0) {                   1881                 if (val < 0) {
1972                         pr_warn("%s: %s %ld\n    1882                         pr_warn("%s: %s %ld\n",
1973                                 __func__, zon    1883                                 __func__, zone_stat_name(i), val);
1974                 }                                1884                 }
1975         }                                        1885         }
1976         for (i = 0; i < NR_VM_NODE_STAT_ITEMS    1886         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
1977                 /*                               1887                 /*
1978                  * Skip checking stats known     1888                  * Skip checking stats known to go negative occasionally.
1979                  */                              1889                  */
1980                 switch (i) {                     1890                 switch (i) {
1981                 case NR_WRITEBACK:               1891                 case NR_WRITEBACK:
1982                         continue;                1892                         continue;
1983                 }                                1893                 }
1984                 val = atomic_long_read(&vm_no    1894                 val = atomic_long_read(&vm_node_stat[i]);
1985                 if (val < 0) {                   1895                 if (val < 0) {
1986                         pr_warn("%s: %s %ld\n    1896                         pr_warn("%s: %s %ld\n",
1987                                 __func__, nod    1897                                 __func__, node_stat_name(i), val);
1988                 }                                1898                 }
1989         }                                        1899         }
1990         if (write)                               1900         if (write)
1991                 *ppos += *lenp;                  1901                 *ppos += *lenp;
1992         else                                     1902         else
1993                 *lenp = 0;                       1903                 *lenp = 0;
1994         return 0;                                1904         return 0;
1995 }                                                1905 }
1996 #endif /* CONFIG_PROC_FS */                      1906 #endif /* CONFIG_PROC_FS */
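
For illustration only (not part of vmstat.c): the comment in vmstat_refresh() notes that /proc/sys/vm/stat_refresh can be written to, or read from, by root to fold the per-cpu deltas into the global counters just before they are sampled. A hedged user-space sketch of that sequence, with minimal error handling:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/vm/stat_refresh", O_WRONLY);

	if (fd < 0) {
		perror("open /proc/sys/vm/stat_refresh");
		return 1;
	}
	/* The written bytes are not parsed; any write triggers the refresh. */
	if (write(fd, "1", 1) < 0)
		perror("write /proc/sys/vm/stat_refresh");
	close(fd);

	/* Counters read from /proc/vmstat after this point are freshly folded. */
	return 0;
}
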
1997                                                  1907 
1998 static void vmstat_update(struct work_struct     1908 static void vmstat_update(struct work_struct *w)
1999 {                                                1909 {
2000         if (refresh_cpu_vm_stats(true)) {        1910         if (refresh_cpu_vm_stats(true)) {
2001                 /*                               1911                 /*
2002                  * Counters were updated so w    1912                  * Counters were updated so we expect more updates
2003                  * to occur in the future. Ke    1913                  * to occur in the future. Keep on running the
2004                  * update worker thread.         1914                  * update worker thread.
2005                  */                              1915                  */
2006                 queue_delayed_work_on(smp_pro    1916                 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
2007                                 this_cpu_ptr(    1917                                 this_cpu_ptr(&vmstat_work),
2008                                 round_jiffies    1918                                 round_jiffies_relative(sysctl_stat_interval));
2009         }                                        1919         }
2010 }                                                1920 }
2011                                                  1921 
2012 /*                                               1922 /*
2013  * Check if the diffs for a certain cpu indic    1923  * Check if the diffs for a certain cpu indicate that
2014  * an update is needed.                          1924  * an update is needed.
2015  */                                              1925  */
2016 static bool need_update(int cpu)                 1926 static bool need_update(int cpu)
2017 {                                                1927 {
2018         pg_data_t *last_pgdat = NULL;            1928         pg_data_t *last_pgdat = NULL;
2019         struct zone *zone;                       1929         struct zone *zone;
2020                                                  1930 
2021         for_each_populated_zone(zone) {          1931         for_each_populated_zone(zone) {
2022                 struct per_cpu_zonestat *pzst    1932                 struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
2023                 struct per_cpu_nodestat *n;      1933                 struct per_cpu_nodestat *n;
2024                                                  1934 
2025                 /*                               1935                 /*
2026                  * The fast way of checking i    1936                  * The fast way of checking if there are any vmstat diffs.
2027                  */                              1937                  */
2028                 if (memchr_inv(pzstats->vm_st    1938                 if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff)))
2029                         return true;             1939                         return true;
2030                                                  1940 
2031                 if (last_pgdat == zone->zone_    1941                 if (last_pgdat == zone->zone_pgdat)
2032                         continue;                1942                         continue;
2033                 last_pgdat = zone->zone_pgdat    1943                 last_pgdat = zone->zone_pgdat;
2034                 n = per_cpu_ptr(zone->zone_pg    1944                 n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
2035                 if (memchr_inv(n->vm_node_sta    1945                 if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff)))
2036                         return true;             1946                         return true;
2037         }                                        1947         }
2038         return false;                            1948         return false;
2039 }                                                1949 }
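
For illustration only (not part of vmstat.c): need_update() uses memchr_inv() as a fast "is this byte array all zeroes?" test. memchr_inv() is a kernel-internal helper; a plain C equivalent of the contract it relies on is sketched below.

#include <stddef.h>

/*
 * Return a pointer to the first byte of p that differs from c, or NULL if
 * all n bytes equal c -- the behaviour need_update() depends on above.
 */
static const void *memchr_inv_like(const void *p, int c, size_t n)
{
	const unsigned char *s = p;
	size_t i;

	for (i = 0; i < n; i++)
		if (s[i] != (unsigned char)c)
			return s + i;
	return NULL;
}

/*
 * Usage mirroring need_update():
 *	if (memchr_inv_like(diff, 0, sizeof(diff)))
 *		... at least one per-cpu delta is still pending ...
 */
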
2040                                                  1950 
2041 /*                                               1951 /*
2042  * Switch off vmstat processing and then fold    1952  * Switch off vmstat processing and then fold all the remaining differentials
2043  * until the diffs stay at zero. The function    1953  * until the diffs stay at zero. The function is used by NOHZ and can only be
2044  * invoked when tick processing is not active    1954  * invoked when tick processing is not active.
2045  */                                              1955  */
2046 void quiet_vmstat(void)                          1956 void quiet_vmstat(void)
2047 {                                                1957 {
2048         if (system_state != SYSTEM_RUNNING)      1958         if (system_state != SYSTEM_RUNNING)
2049                 return;                          1959                 return;
2050                                                  1960 
2051         if (!delayed_work_pending(this_cpu_pt    1961         if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
2052                 return;                          1962                 return;
2053                                                  1963 
2054         if (!need_update(smp_processor_id()))    1964         if (!need_update(smp_processor_id()))
2055                 return;                          1965                 return;
2056                                                  1966 
2057         /*                                       1967         /*
2058          * Just refresh counters and do not c    1968          * Just refresh counters and do not care about the pending delayed
2059          * vmstat_update. It doesn't fire tha    1969          * vmstat_update. It doesn't fire that often to matter and canceling
2060          * it would be too expensive from thi    1970          * it would be too expensive from this path.
2061          * vmstat_shepherd will take care abo    1971          * vmstat_shepherd will take care about that for us.
2062          */                                      1972          */
2063         refresh_cpu_vm_stats(false);             1973         refresh_cpu_vm_stats(false);
2064 }                                                1974 }
2065                                                  1975 
2066 /*                                               1976 /*
2067  * Shepherd worker thread that checks the        1977  * Shepherd worker thread that checks the
2068  * differentials of processors that have thei    1978  * differentials of processors that have their worker
2069  * threads for vm statistics updates disabled    1979  * threads for vm statistics updates disabled because of
2070  * inactivity.                                   1980  * inactivity.
2071  */                                              1981  */
2072 static void vmstat_shepherd(struct work_struc    1982 static void vmstat_shepherd(struct work_struct *w);
2073                                                  1983 
2074 static DECLARE_DEFERRABLE_WORK(shepherd, vmst    1984 static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
2075                                                  1985 
2076 static void vmstat_shepherd(struct work_struc    1986 static void vmstat_shepherd(struct work_struct *w)
2077 {                                                1987 {
2078         int cpu;                                 1988         int cpu;
2079                                                  1989 
2080         cpus_read_lock();                        1990         cpus_read_lock();
2081         /* Check processors whose vmstat work    1991         /* Check processors whose vmstat worker threads have been disabled */
2082         for_each_online_cpu(cpu) {               1992         for_each_online_cpu(cpu) {
2083                 struct delayed_work *dw = &pe    1993                 struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
2084                                                  1994 
2085                 /*                            << 
2086                  * In kernel users of vmstat  << 
2087                  * they are using zone_page_s << 
2088                  * an imprecision as the regu << 
2089                  * cumulative error can grow  << 
2090                  *                            << 
2091                  * From that POV the regular  << 
2092                  * been isolated from the ker << 
2093                  * infrastructure ever notici << 
2094                  * for all isolated CPUs to a << 
2095                  */                           << 
2096                 if (cpu_is_isolated(cpu))     << 
2097                         continue;             << 
2098                                               << 
2099                 if (!delayed_work_pending(dw)    1995                 if (!delayed_work_pending(dw) && need_update(cpu))
2100                         queue_delayed_work_on    1996                         queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
2101                                                  1997 
2102                 cond_resched();                  1998                 cond_resched();
2103         }                                        1999         }
2104         cpus_read_unlock();                      2000         cpus_read_unlock();
2105                                                  2001 
2106         schedule_delayed_work(&shepherd,         2002         schedule_delayed_work(&shepherd,
2107                 round_jiffies_relative(sysctl    2003                 round_jiffies_relative(sysctl_stat_interval));
2108 }                                                2004 }
2109                                                  2005 
2110 static void __init start_shepherd_timer(void)    2006 static void __init start_shepherd_timer(void)
2111 {                                                2007 {
2112         int cpu;                                 2008         int cpu;
2113                                                  2009 
2114         for_each_possible_cpu(cpu)               2010         for_each_possible_cpu(cpu)
2115                 INIT_DEFERRABLE_WORK(per_cpu_    2011                 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
2116                         vmstat_update);          2012                         vmstat_update);
2117                                                  2013 
2118         schedule_delayed_work(&shepherd,         2014         schedule_delayed_work(&shepherd,
2119                 round_jiffies_relative(sysctl    2015                 round_jiffies_relative(sysctl_stat_interval));
2120 }                                                2016 }
2121                                                  2017 
2122 static void __init init_cpu_node_state(void)     2018 static void __init init_cpu_node_state(void)
2123 {                                                2019 {
2124         int node;                                2020         int node;
2125                                                  2021 
2126         for_each_online_node(node) {             2022         for_each_online_node(node) {
2127                 if (!cpumask_empty(cpumask_of !! 2023                 if (cpumask_weight(cpumask_of_node(node)) > 0)
2128                         node_set_state(node,     2024                         node_set_state(node, N_CPU);
2129         }                                        2025         }
2130 }                                                2026 }
2131                                                  2027 
2132 static int vmstat_cpu_online(unsigned int cpu    2028 static int vmstat_cpu_online(unsigned int cpu)
2133 {                                                2029 {
2134         refresh_zone_stat_thresholds();          2030         refresh_zone_stat_thresholds();
2135                                               !! 2031         node_set_state(cpu_to_node(cpu), N_CPU);
2136         if (!node_state(cpu_to_node(cpu), N_C << 
2137                 node_set_state(cpu_to_node(cp << 
2138         }                                     << 
2139                                               << 
2140         return 0;                                2032         return 0;
2141 }                                                2033 }
2142                                                  2034 
2143 static int vmstat_cpu_down_prep(unsigned int     2035 static int vmstat_cpu_down_prep(unsigned int cpu)
2144 {                                                2036 {
2145         cancel_delayed_work_sync(&per_cpu(vms    2037         cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
2146         return 0;                                2038         return 0;
2147 }                                                2039 }
2148                                                  2040 
2149 static int vmstat_cpu_dead(unsigned int cpu)     2041 static int vmstat_cpu_dead(unsigned int cpu)
2150 {                                                2042 {
2151         const struct cpumask *node_cpus;         2043         const struct cpumask *node_cpus;
2152         int node;                                2044         int node;
2153                                                  2045 
2154         node = cpu_to_node(cpu);                 2046         node = cpu_to_node(cpu);
2155                                                  2047 
2156         refresh_zone_stat_thresholds();          2048         refresh_zone_stat_thresholds();
2157         node_cpus = cpumask_of_node(node);       2049         node_cpus = cpumask_of_node(node);
2158         if (!cpumask_empty(node_cpus))        !! 2050         if (cpumask_weight(node_cpus) > 0)
2159                 return 0;                        2051                 return 0;
2160                                                  2052 
2161         node_clear_state(node, N_CPU);           2053         node_clear_state(node, N_CPU);
2162                                               << 
2163         return 0;                                2054         return 0;
2164 }                                                2055 }
2165                                                  2056 
2166 #endif                                           2057 #endif
2167                                                  2058 
2168 struct workqueue_struct *mm_percpu_wq;           2059 struct workqueue_struct *mm_percpu_wq;
2169                                                  2060 
2170 void __init init_mm_internals(void)              2061 void __init init_mm_internals(void)
2171 {                                                2062 {
2172         int ret __maybe_unused;                  2063         int ret __maybe_unused;
2173                                                  2064 
2174         mm_percpu_wq = alloc_workqueue("mm_pe    2065         mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
2175                                                  2066 
2176 #ifdef CONFIG_SMP                                2067 #ifdef CONFIG_SMP
2177         ret = cpuhp_setup_state_nocalls(CPUHP    2068         ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
2178                                         NULL,    2069                                         NULL, vmstat_cpu_dead);
2179         if (ret < 0)                             2070         if (ret < 0)
2180                 pr_err("vmstat: failed to reg    2071                 pr_err("vmstat: failed to register 'dead' hotplug state\n");
2181                                                  2072 
2182         ret = cpuhp_setup_state_nocalls(CPUHP    2073         ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online",
2183                                         vmsta    2074                                         vmstat_cpu_online,
2184                                         vmsta    2075                                         vmstat_cpu_down_prep);
2185         if (ret < 0)                             2076         if (ret < 0)
2186                 pr_err("vmstat: failed to reg    2077                 pr_err("vmstat: failed to register 'online' hotplug state\n");
2187                                                  2078 
2188         cpus_read_lock();                        2079         cpus_read_lock();
2189         init_cpu_node_state();                   2080         init_cpu_node_state();
2190         cpus_read_unlock();                      2081         cpus_read_unlock();
2191                                                  2082 
2192         start_shepherd_timer();                  2083         start_shepherd_timer();
2193 #endif                                           2084 #endif
2194 #ifdef CONFIG_PROC_FS                            2085 #ifdef CONFIG_PROC_FS
2195         proc_create_seq("buddyinfo", 0444, NU    2086         proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
2196         proc_create_seq("pagetypeinfo", 0400,    2087         proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
2197         proc_create_seq("vmstat", 0444, NULL,    2088         proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
2198         proc_create_seq("zoneinfo", 0444, NUL    2089         proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
2199 #endif                                           2090 #endif
2200 }                                                2091 }
2201                                                  2092 
2202 #if defined(CONFIG_DEBUG_FS) && defined(CONFI    2093 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
2203                                                  2094 
2204 /*                                               2095 /*
2205  * Return an index indicating how much of the    2096  * Return an index indicating how much of the available free memory is
2206  * unusable for an allocation of the requeste    2097  * unusable for an allocation of the requested size.
2207  */                                              2098  */
2208 static int unusable_free_index(unsigned int o    2099 static int unusable_free_index(unsigned int order,
2209                                 struct contig    2100                                 struct contig_page_info *info)
2210 {                                                2101 {
2211         /* No free memory is interpreted as a    2102         /* No free memory is interpreted as all free memory is unusable */
2212         if (info->free_pages == 0)               2103         if (info->free_pages == 0)
2213                 return 1000;                     2104                 return 1000;
2214                                                  2105 
2215         /*                                       2106         /*
2216          * Index should be a value between 0     2107          * Index should be a value between 0 and 1. Return a value to 3
2217          * decimal places.                       2108          * decimal places.
2218          *                                       2109          *
2219          * 0 => no fragmentation                 2110          * 0 => no fragmentation
2220          * 1 => high fragmentation               2111          * 1 => high fragmentation
2221          */                                      2112          */
2222         return div_u64((info->free_pages - (i    2113         return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
2223                                                  2114 
2224 }                                                2115 }
2225                                                  2116 
2226 static void unusable_show_print(struct seq_fi    2117 static void unusable_show_print(struct seq_file *m,
2227                                         pg_da    2118                                         pg_data_t *pgdat, struct zone *zone)
2228 {                                                2119 {
2229         unsigned int order;                      2120         unsigned int order;
2230         int index;                               2121         int index;
2231         struct contig_page_info info;            2122         struct contig_page_info info;
2232                                                  2123 
2233         seq_printf(m, "Node %d, zone %8s ",      2124         seq_printf(m, "Node %d, zone %8s ",
2234                                 pgdat->node_i    2125                                 pgdat->node_id,
2235                                 zone->name);     2126                                 zone->name);
2236         for (order = 0; order < NR_PAGE_ORDER !! 2127         for (order = 0; order < MAX_ORDER; ++order) {
2237                 fill_contig_page_info(zone, o    2128                 fill_contig_page_info(zone, order, &info);
2238                 index = unusable_free_index(o    2129                 index = unusable_free_index(order, &info);
2239                 seq_printf(m, "%d.%03d ", ind    2130                 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2240         }                                        2131         }
2241                                                  2132 
2242         seq_putc(m, '\n');                       2133         seq_putc(m, '\n');
2243 }                                                2134 }
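
A worked example with made-up numbers, for illustration only: if a zone has info->free_pages = 1000 and only info->free_blocks_suitable = 10 of its free blocks are large enough for an order-4 request, those blocks cover 10 << 4 = 160 usable pages, so unusable_free_index() returns (1000 - 160) * 1000 / 1000 = 840, which unusable_show_print() renders as "0.840", i.e. 84% of the zone's free memory is unusable at that order.
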
2244                                                  2135 
2245 /*                                               2136 /*
2246  * Display unusable free space index             2137  * Display unusable free space index
2247  *                                               2138  *
2248  * The unusable free space index measures how    2139  * The unusable free space index measures how much of the available free
2249  * memory cannot be used to satisfy an alloca    2140  * memory cannot be used to satisfy an allocation of a given size and is a
2250  * value between 0 and 1. The higher the valu    2141  * value between 0 and 1. The higher the value, the more of free memory is
2251  * unusable and by implication, the worse the    2142  * unusable and by implication, the worse the external fragmentation is. This
2252  * can be expressed as a percentage by multip    2143  * can be expressed as a percentage by multiplying by 100.
2253  */                                              2144  */
2254 static int unusable_show(struct seq_file *m,     2145 static int unusable_show(struct seq_file *m, void *arg)
2255 {                                                2146 {
2256         pg_data_t *pgdat = (pg_data_t *)arg;     2147         pg_data_t *pgdat = (pg_data_t *)arg;
2257                                                  2148 
2258         /* check memoryless node */              2149         /* check memoryless node */
2259         if (!node_state(pgdat->node_id, N_MEM    2150         if (!node_state(pgdat->node_id, N_MEMORY))
2260                 return 0;                        2151                 return 0;
2261                                                  2152 
2262         walk_zones_in_node(m, pgdat, true, fa    2153         walk_zones_in_node(m, pgdat, true, false, unusable_show_print);
2263                                                  2154 
2264         return 0;                                2155         return 0;
2265 }                                                2156 }
2266                                                  2157 
2267 static const struct seq_operations unusable_s    2158 static const struct seq_operations unusable_sops = {
2268         .start  = frag_start,                    2159         .start  = frag_start,
2269         .next   = frag_next,                     2160         .next   = frag_next,
2270         .stop   = frag_stop,                     2161         .stop   = frag_stop,
2271         .show   = unusable_show,                 2162         .show   = unusable_show,
2272 };                                               2163 };
2273                                                  2164 
2274 DEFINE_SEQ_ATTRIBUTE(unusable);                  2165 DEFINE_SEQ_ATTRIBUTE(unusable);
2275                                                  2166 
2276 static void extfrag_show_print(struct seq_fil    2167 static void extfrag_show_print(struct seq_file *m,
2277                                         pg_da    2168                                         pg_data_t *pgdat, struct zone *zone)
2278 {                                                2169 {
2279         unsigned int order;                      2170         unsigned int order;
2280         int index;                               2171         int index;
2281                                                  2172 
2282         /* Alloc on stack as interrupts are d    2173         /* Alloc on stack as interrupts are disabled for zone walk */
2283         struct contig_page_info info;            2174         struct contig_page_info info;
2284                                                  2175 
2285         seq_printf(m, "Node %d, zone %8s ",      2176         seq_printf(m, "Node %d, zone %8s ",
2286                                 pgdat->node_i    2177                                 pgdat->node_id,
2287                                 zone->name);     2178                                 zone->name);
2288         for (order = 0; order < NR_PAGE_ORDER !! 2179         for (order = 0; order < MAX_ORDER; ++order) {
2289                 fill_contig_page_info(zone, o    2180                 fill_contig_page_info(zone, order, &info);
2290                 index = __fragmentation_index    2181                 index = __fragmentation_index(order, &info);
2291                 seq_printf(m, "%2d.%03d ", in !! 2182                 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
2292         }                                        2183         }
2293                                                  2184 
2294         seq_putc(m, '\n');                       2185         seq_putc(m, '\n');
2295 }                                                2186 }
2296                                                  2187 
2297 /*                                               2188 /*
2298  * Display fragmentation index for orders tha    2189  * Display fragmentation index for orders that allocations would fail for
2299  */                                              2190  */
2300 static int extfrag_show(struct seq_file *m, v    2191 static int extfrag_show(struct seq_file *m, void *arg)
2301 {                                                2192 {
2302         pg_data_t *pgdat = (pg_data_t *)arg;     2193         pg_data_t *pgdat = (pg_data_t *)arg;
2303                                                  2194 
2304         walk_zones_in_node(m, pgdat, true, fa    2195         walk_zones_in_node(m, pgdat, true, false, extfrag_show_print);
2305                                                  2196 
2306         return 0;                                2197         return 0;
2307 }                                                2198 }
2308                                                  2199 
2309 static const struct seq_operations extfrag_so    2200 static const struct seq_operations extfrag_sops = {
2310         .start  = frag_start,                    2201         .start  = frag_start,
2311         .next   = frag_next,                     2202         .next   = frag_next,
2312         .stop   = frag_stop,                     2203         .stop   = frag_stop,
2313         .show   = extfrag_show,                  2204         .show   = extfrag_show,
2314 };                                               2205 };
2315                                                  2206 
2316 DEFINE_SEQ_ATTRIBUTE(extfrag);                   2207 DEFINE_SEQ_ATTRIBUTE(extfrag);
2317                                                  2208 
2318 static int __init extfrag_debug_init(void)       2209 static int __init extfrag_debug_init(void)
2319 {                                                2210 {
2320         struct dentry *extfrag_debug_root;       2211         struct dentry *extfrag_debug_root;
2321                                                  2212 
2322         extfrag_debug_root = debugfs_create_d    2213         extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
2323                                                  2214 
2324         debugfs_create_file("unusable_index",    2215         debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
2325                             &unusable_fops);     2216                             &unusable_fops);
2326                                                  2217 
2327         debugfs_create_file("extfrag_index",     2218         debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
2328                             &extfrag_fops);      2219                             &extfrag_fops);
2329                                                  2220 
2330         return 0;                                2221         return 0;
2331 }                                                2222 }
2332                                                  2223 
2333 module_init(extfrag_debug_init);                 2224 module_init(extfrag_debug_init);
2334                                               << 
2335 #endif                                           2225 #endif
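
Usage note, assuming debugfs is mounted at its conventional /sys/kernel/debug location: extfrag_debug_init() above creates two read-only files, /sys/kernel/debug/extfrag/unusable_index and /sys/kernel/debug/extfrag/extfrag_index, which can be read directly (typically as root). Each prints one "Node N, zone NAME" row per populated zone with one index value per page order.
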
2336                                                  2226 
