// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/vmstat.c
 *
 *  Manages VM statistics
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  zoned VM statistics
 *  Copyright (C) 2006 Silicon Graphics, Inc.,
 *		Christoph Lameter <christoph@lameter.com>
 *  Copyright (C) 2008-2014 Christoph Lameter
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/writeback.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/page_owner.h>
#include <linux/sched/isolation.h>

#include "internal.h"

#ifdef CONFIG_NUMA
int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;

/* zero numa counters within a zone */
static void zero_zone_numa_counters(struct zone *zone)
{
	int item, cpu;

	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) {
		atomic_long_set(&zone->vm_numa_event[item], 0);
		for_each_online_cpu(cpu) {
			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item]
						= 0;
		}
	}
}

/* zero numa counters of all the populated zones */
static void zero_zones_numa_counters(void)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		zero_zone_numa_counters(zone);
}

/* zero global numa counters */
static void zero_global_numa_counters(void)
{
	int item;

	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
		atomic_long_set(&vm_numa_event[item], 0);
}
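
/*
 * Example (assuming the sysctl is exposed under /proc/sys/vm as usual):
 * the helpers above back the vm.numa_stat toggle handled below, so
 *
 *	echo 0 > /proc/sys/vm/numa_stat
 *
 * disables NUMA statistics collection and clears the per-zone and global
 * counters, while writing 1 re-enables collection starting from zero.
 */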

static void invalid_numa_statistics(void)
{
	zero_zones_numa_counters();
	zero_global_numa_counters();
}

static DEFINE_MUTEX(vm_numa_stat_lock);

int sysctl_vm_numa_stat_handler(const struct ctl_table *table, int write,
		void *buffer, size_t *length, loff_t *ppos)
{
	int ret, oldval;

	mutex_lock(&vm_numa_stat_lock);
	if (write)
		oldval = sysctl_vm_numa_stat;
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
	if (ret || !write)
		goto out;

	if (oldval == sysctl_vm_numa_stat)
		goto out;
	else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
		static_branch_enable(&vm_numa_stat_key);
		pr_info("enable numa statistics\n");
	} else {
		static_branch_disable(&vm_numa_stat_key);
		invalid_numa_statistics();
		pr_info("disable numa statistics, and clear numa counters\n");
	}

out:
	mutex_unlock(&vm_numa_stat_lock);
	return ret;
}
#endif

#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);

static void sum_vm_events(unsigned long *ret)
{
	int cpu;
	int i;

	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));

	for_each_online_cpu(cpu) {
		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);

		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
			ret[i] += this->event[i];
	}
}

/*
 * Accumulate the vm event counters across all CPUs.
 * The result is unavoidably approximate - it can change
 * during and after execution of this function.
 */
void all_vm_events(unsigned long *ret)
{
	cpus_read_lock();
	sum_vm_events(ret);
	cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(all_vm_events);

/*
 * Fold the foreign cpu events into our own.
 *
 * This is adding to the events on one processor
 * but keeps the global counts constant.
 */
void vm_events_fold_cpu(int cpu)
{
	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
	int i;

	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
		count_vm_events(i, fold_state->event[i]);
		fold_state->event[i] = 0;
	}
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Manage combined zone based / global counters
 *
 * vm_stat contains the global counters
 */
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_node_stat);

#ifdef CONFIG_NUMA
static void fold_vm_zone_numa_events(struct zone *zone)
{
	unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = {0, };
	int cpu;
	enum numa_stat_item item;

	for_each_online_cpu(cpu) {
		struct per_cpu_zonestat *pzstats;

		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
		for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
			zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
	}

	for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
		zone_numa_event_add(zone_numa_events[item], zone, item);
}

void fold_vm_numa_events(void)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		fold_vm_zone_numa_events(zone);
}
#endif

#ifdef CONFIG_SMP

int calculate_pressure_threshold(struct zone *zone)
{
	int threshold;
	int watermark_distance;

	/*
	 * As vmstats are not up to date, there is drift between the estimated
	 * and real values. For high thresholds and a high number of CPUs, it
	 * is possible for the min watermark to be breached while the estimated
	 * value looks fine. The pressure threshold is a reduced value such
	 * that even the maximum amount of drift will not accidentally breach
	 * the min watermark
	 */
	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
	threshold = max(1, (int)(watermark_distance / num_online_cpus()));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}

int calculate_normal_threshold(struct zone *zone)
{
	int threshold;
	int mem;	/* memory in 128 MB units */

	/*
	 * The threshold scales with the number of processors and the amount
	 * of memory per zone. More memory means that we can defer updates for
	 * longer, more processors could lead to more contention.
	 * fls() is used to have a cheap way of logarithmic scaling.
	 *
	 * Some sample thresholds:
	 *
	 * Threshold	Processors	(fls)	Zonesize	fls(mem)+1
	 * ------------------------------------------------------------------
	 * 8		1		1	0.9-1 GB	4
	 * 16		2		2	0.9-1 GB	4
	 * 20		2		2	1-2 GB		5
	 * 24		2		2	2-4 GB		6
	 * 28		2		2	4-8 GB		7
	 * 32		2		2	8-16 GB		8
	 * 4		2		2	<128M		1
	 * 30		4		3	2-4 GB		5
	 * 48		4		3	8-16 GB		8
	 * 32		8		4	1-2 GB		4
	 * 32		8		4	0.9-1GB		4
	 * 10		16		5	<128M		1
	 * 40		16		5	900M		4
	 * 70		64		7	2-4 GB		5
	 * 84		64		7	4-8 GB		6
	 * 108		512		9	4-8 GB		6
	 * 125		1024		10	8-16 GB		8
	 * 125		1024		10	16-32 GB	9
	 */

	mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);

	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));

	/*
	 * Maximum threshold is 125
	 */
	threshold = min(125, threshold);

	return threshold;
}
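
/*
 * Worked example of the formula above (illustrative): 16 online CPUs
 * give fls(16) = 5, and a zone managing 4 GB gives mem = 32 units of
 * 128 MB, fls(32) = 6, so threshold = 2 * 5 * (1 + 6) = 70, well under
 * the 125 cap. Bigger zones or more CPUs simply saturate at 125.
 */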

/*
 * Refresh the thresholds for each zone.
 */
void refresh_zone_stat_thresholds(void)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int cpu;
	int threshold;

	/* Zero current pgdat thresholds */
	for_each_online_pgdat(pgdat) {
		for_each_online_cpu(cpu) {
			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
		}
	}

	for_each_populated_zone(zone) {
		struct pglist_data *pgdat = zone->zone_pgdat;
		unsigned long max_drift, tolerate_drift;

		threshold = calculate_normal_threshold(zone);

		for_each_online_cpu(cpu) {
			int pgdat_threshold;

			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
							= threshold;

			/* Base nodestat threshold on the largest populated zone. */
			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
				= max(threshold, pgdat_threshold);
		}

		/*
		 * Only set percpu_drift_mark if there is a danger that
		 * NR_FREE_PAGES reports the low watermark is ok when in fact
		 * the min watermark could be breached by an allocation
		 */
		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
		max_drift = num_online_cpus() * threshold;
		if (max_drift > tolerate_drift)
			zone->percpu_drift_mark = high_wmark_pages(zone) +
					max_drift;
	}
}

void set_pgdat_percpu_threshold(pg_data_t *pgdat,
				int (*calculate_pressure)(struct zone *))
{
	struct zone *zone;
	int cpu;
	int threshold;
	int i;

	for (i = 0; i < pgdat->nr_zones; i++) {
		zone = &pgdat->node_zones[i];
		if (!zone->percpu_drift_mark)
			continue;

		threshold = (*calculate_pressure)(zone);
		for_each_online_cpu(cpu)
			per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold
							= threshold;
	}
}
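
/*
 * set_pgdat_percpu_threshold() is the hook that lets the reclaim side
 * (typically kswapd) temporarily install the tighter
 * calculate_pressure_threshold() on zones that carry a percpu_drift_mark,
 * so that counter drift cannot hide a breached min watermark, and then
 * restore calculate_normal_threshold() once pressure subsides.
 */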

/*
 * For use when we know that interrupts are disabled,
 * or when we know that preemption is disabled and that
 * particular counter cannot be updated from interrupt context.
 */
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			   long delta)
{
	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long x;
	long t;

	/*
	 * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels,
	 * atomicity is provided by IRQs being disabled -- either explicitly
	 * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables
	 * CPU migrations and preemption potentially corrupts a counter so
	 * disable preemption.
	 */
	preempt_disable_nested();

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(abs(x) > t)) {
		zone_page_state_add(x, zone, item);
		x = 0;
	}
	__this_cpu_write(*p, x);

	preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_zone_page_state);

void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
				long delta)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	long x;
	long t;

	if (vmstat_item_in_bytes(item)) {
		/*
		 * Only cgroups use subpage accounting right now; at
		 * the global level, these items still change in
		 * multiples of whole pages. Store them as pages
		 * internally to keep the per-cpu counters compact.
		 */
		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
		delta >>= PAGE_SHIFT;
	}

	/* See __mod_node_page_state */
	preempt_disable_nested();

	x = delta + __this_cpu_read(*p);

	t = __this_cpu_read(pcp->stat_threshold);

	if (unlikely(abs(x) > t)) {
		node_page_state_add(x, pgdat, item);
		x = 0;
	}
	__this_cpu_write(*p, x);

	preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_node_page_state);
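
/*
 * Batching example: with a per-cpu stat_threshold of 32, repeated small
 * deltas accumulate in the s8 vm_stat_diff slot and only spill into the
 * global atomic counter once the running value leaves the [-32, 32]
 * window. A reader of the global counter can therefore lag by up to
 * num_online_cpus() * threshold pages, which is the drift that
 * refresh_zone_stat_thresholds() bounds via percpu_drift_mark.
 */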

/*
 * Optimized increment and decrement functions.
 *
 * These are only for a single page and therefore can take a struct page *
 * argument instead of struct zone *. This allows the inclusion of the code
 * generated for page_zone(page) into the optimized functions.
 *
 * No overflow check is necessary and therefore the differential can be
 * incremented or decremented in place which may allow the compilers to
 * generate better code.
 * The increment or decrement is known and therefore one boundary check can
 * be omitted.
 *
 * NOTE: These functions are very performance sensitive. Change only
 * with care.
 *
 * Some processors have inc/dec instructions that are atomic vs an interrupt.
 * However, the code must first determine the differential location in a zone
 * based on the processor number and then inc/dec the counter. There is no
 * guarantee without disabling preemption that the processor will not change
 * in between and therefore the atomicity vs. interrupt cannot be exploited
 * in a useful way here.
 */
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	/* See __mod_node_page_state */
	preempt_disable_nested();

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v + overstep, zone, item);
		__this_cpu_write(*p, -overstep);
	}

	preempt_enable_nested();
}

void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	s8 v, t;

	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	/* See __mod_node_page_state */
	preempt_disable_nested();

	v = __this_cpu_inc_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v > t)) {
		s8 overstep = t >> 1;

		node_page_state_add(v + overstep, pgdat, item);
		__this_cpu_write(*p, -overstep);
	}

	preempt_enable_nested();
}

void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__inc_zone_page_state);

void __inc_node_page_state(struct page *page, enum node_stat_item item)
{
	__inc_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__inc_node_page_state);
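
/*
 * Overstep example: with a threshold t of 32, the increment that pushes
 * the per-cpu diff past t folds v + 16 (half the threshold extra) into
 * the global counter and resets the diff to -16, so the next batch of
 * increments stays purely per-cpu instead of hitting the shared
 * cacheline every time the counter hovers around the threshold.
 */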

void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	s8 v, t;

	/* See __mod_node_page_state */
	preempt_disable_nested();

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < - t)) {
		s8 overstep = t >> 1;

		zone_page_state_add(v - overstep, zone, item);
		__this_cpu_write(*p, overstep);
	}

	preempt_enable_nested();
}

void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	s8 v, t;

	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	/* See __mod_node_page_state */
	preempt_disable_nested();

	v = __this_cpu_dec_return(*p);
	t = __this_cpu_read(pcp->stat_threshold);
	if (unlikely(v < - t)) {
		s8 overstep = t >> 1;

		node_page_state_add(v - overstep, pgdat, item);
		__this_cpu_write(*p, overstep);
	}

	preempt_enable_nested();
}

void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}
EXPORT_SYMBOL(__dec_zone_page_state);

void __dec_node_page_state(struct page *page, enum node_stat_item item)
{
	__dec_node_state(page_pgdat(page), item);
}
EXPORT_SYMBOL(__dec_node_page_state);

#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * If we have cmpxchg_local support then we do not need to incur the overhead
 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
 *
 * mod_state() modifies the zone counter state through atomic per cpu
 * operations.
 *
 * Overstep mode specifies how overstep should be handled:
 *        0       No overstepping
 *        1       Overstepping half of threshold
 *        -1      Overstepping minus half of threshold
 */
static inline void mod_zone_state(struct zone *zone,
       enum zone_stat_item item, long delta, int overstep_mode)
{
	struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
	s8 __percpu *p = pcp->vm_stat_diff + item;
	long n, t, z;
	s8 o;

	o = this_cpu_read(*p);
	do {
		z = 0;  /* overflow to zone counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyways
		 * for all cpus in a zone.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		n = delta + (long)o;

		if (abs(n) > t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to zone counters */
			z = n + os;
			n = -os;
		}
	} while (!this_cpu_try_cmpxchg(*p, &o, n));

	if (z)
		zone_page_state_add(z, zone, item);
}
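
/*
 * The try_cmpxchg loop above is what makes this safe without disabling
 * interrupts: if the per-cpu diff changes between reading 'o' and the
 * exchange (an interrupt on this CPU, or a migration to another CPU),
 * the cmpxchg fails, 'o' is refreshed and the calculation is redone
 * against the current value.
 */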

void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			 long delta)
{
	mod_zone_state(zone, item, delta, 0);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_zone_state(page_zone(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	mod_zone_state(page_zone(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_zone_page_state);

static inline void mod_node_state(struct pglist_data *pgdat,
       enum node_stat_item item, int delta, int overstep_mode)
{
	struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
	s8 __percpu *p = pcp->vm_node_stat_diff + item;
	long n, t, z;
	s8 o;

	if (vmstat_item_in_bytes(item)) {
		/*
		 * Only cgroups use subpage accounting right now; at
		 * the global level, these items still change in
		 * multiples of whole pages. Store them as pages
		 * internally to keep the per-cpu counters compact.
		 */
		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
		delta >>= PAGE_SHIFT;
	}

	o = this_cpu_read(*p);
	do {
		z = 0;  /* overflow to node counters */

		/*
		 * The fetching of the stat_threshold is racy. We may apply
		 * a counter threshold to the wrong cpu if we get
		 * rescheduled while executing here. However, the next
		 * counter update will apply the threshold again and
		 * therefore bring the counter under the threshold again.
		 *
		 * Most of the time the thresholds are the same anyways
		 * for all cpus in a node.
		 */
		t = this_cpu_read(pcp->stat_threshold);

		n = delta + (long)o;

		if (abs(n) > t) {
			int os = overstep_mode * (t >> 1);

			/* Overflow must be added to node counters */
			z = n + os;
			n = -os;
		}
	} while (!this_cpu_try_cmpxchg(*p, &o, n));

	if (z)
		node_page_state_add(z, pgdat, item);
}

void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
					long delta)
{
	mod_node_state(pgdat, item, delta, 0);
}
EXPORT_SYMBOL(mod_node_page_state);

void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	mod_node_state(pgdat, item, 1, 1);
}

void inc_node_page_state(struct page *page, enum node_stat_item item)
{
	mod_node_state(page_pgdat(page), item, 1, 1);
}
EXPORT_SYMBOL(inc_node_page_state);

void dec_node_page_state(struct page *page, enum node_stat_item item)
{
	mod_node_state(page_pgdat(page), item, -1, -1);
}
EXPORT_SYMBOL(dec_node_page_state);
#else
/*
 * Use interrupt disable to serialize counter updates
 */
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
			 long delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_zone_page_state(zone, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_zone_page_state);

void inc_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;
	struct zone *zone;

	zone = page_zone(page);
	local_irq_save(flags);
	__inc_zone_state(zone, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_zone_page_state);

void dec_zone_page_state(struct page *page, enum zone_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_zone_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);

void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__inc_node_state(pgdat, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_state);

void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
					long delta)
{
	unsigned long flags;

	local_irq_save(flags);
	__mod_node_page_state(pgdat, item, delta);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(mod_node_page_state);

void inc_node_page_state(struct page *page, enum node_stat_item item)
{
	unsigned long flags;
	struct pglist_data *pgdat;

	pgdat = page_pgdat(page);
	local_irq_save(flags);
	__inc_node_state(pgdat, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(inc_node_page_state);

void dec_node_page_state(struct page *page, enum node_stat_item item)
{
	unsigned long flags;

	local_irq_save(flags);
	__dec_node_page_state(page, item);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_node_page_state);
#endif

/*
 * Fold a differential into the global counters.
 * Returns the number of counters updated.
 */
static int fold_diff(int *zone_diff, int *node_diff)
{
	int i;
	int changes = 0;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		if (zone_diff[i]) {
			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
			changes++;
		}

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
		if (node_diff[i]) {
			atomic_long_add(node_diff[i], &vm_node_stat[i]);
			changes++;
		}
	return changes;
}

/*
 * Update the zone counters for the current cpu.
 *
 * Note that refresh_cpu_vm_stats strives to only access
 * node local memory. The per cpu pagesets on remote zones are placed
 * in the memory local to the processor using that pageset. So the
 * loop over all zones will access a series of cachelines local to
 * the processor.
 *
 * The call to zone_page_state_add updates the cachelines with the
 * statistics in the remote zone struct as well as the global cachelines
 * with the global counters. These could cause remote node cache line
 * bouncing and will have to be only done when necessary.
 *
 * The function returns the number of global counters updated.
 */
static int refresh_cpu_vm_stats(bool do_pagesets)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int i;
	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
	int changes = 0;

	for_each_populated_zone(zone) {
		struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;
		struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset;

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0);
			if (v) {

				atomic_long_add(v, &zone->vm_stat[i]);
				global_zone_diff[i] += v;
#ifdef CONFIG_NUMA
				/* 3 seconds idle till flush */
				__this_cpu_write(pcp->expire, 3);
#endif
			}
		}

		if (do_pagesets) {
			cond_resched();

			changes += decay_pcp_high(zone, this_cpu_ptr(pcp));
#ifdef CONFIG_NUMA
			/*
			 * Deal with draining the remote pageset of this
			 * processor
			 *
			 * Check if there are pages remaining in this pageset
			 * if not then there is nothing to expire.
			 */
			if (!__this_cpu_read(pcp->expire) ||
			       !__this_cpu_read(pcp->count))
				continue;

			/*
			 * We never drain zones local to this processor.
			 */
			if (zone_to_nid(zone) == numa_node_id()) {
				__this_cpu_write(pcp->expire, 0);
				continue;
			}

			if (__this_cpu_dec_return(pcp->expire)) {
				changes++;
				continue;
			}

			if (__this_cpu_read(pcp->count)) {
				drain_zone_pages(zone, this_cpu_ptr(pcp));
				changes++;
			}
#endif
		}
	}

	for_each_online_pgdat(pgdat) {
		struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;

		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
			int v;

			v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
			if (v) {
				atomic_long_add(v, &pgdat->vm_stat[i]);
				global_node_diff[i] += v;
			}
		}
	}

	changes += fold_diff(global_zone_diff, global_node_diff);
	return changes;
}
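
/*
 * refresh_cpu_vm_stats() is normally driven from the periodic per-cpu
 * vmstat work; where the stat_refresh sysctl is available, writing to
 * /proc/sys/vm/stat_refresh forces a synchronous fold on every CPU so
 * that a subsequent read of /proc/vmstat reflects an up-to-date snapshot
 * rather than values that may lag by up to the per-cpu thresholds.
 */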

/*
 * Fold the data for an offline cpu into the global array.
 * There cannot be any access by the offline cpu and therefore
 * synchronization is simplified.
 */
void cpu_vm_stats_fold(int cpu)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int i;
	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };

	for_each_populated_zone(zone) {
		struct per_cpu_zonestat *pzstats;

		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);

		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
			if (pzstats->vm_stat_diff[i]) {
				int v;

				v = pzstats->vm_stat_diff[i];
				pzstats->vm_stat_diff[i] = 0;
				atomic_long_add(v, &zone->vm_stat[i]);
				global_zone_diff[i] += v;
			}
		}
#ifdef CONFIG_NUMA
		for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
			if (pzstats->vm_numa_event[i]) {
				unsigned long v;

				v = pzstats->vm_numa_event[i];
				pzstats->vm_numa_event[i] = 0;
				zone_numa_event_add(v, zone, i);
			}
		}
#endif
	}

	for_each_online_pgdat(pgdat) {
		struct per_cpu_nodestat *p;

		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);

		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
			if (p->vm_node_stat_diff[i]) {
				int v;

				v = p->vm_node_stat_diff[i];
				p->vm_node_stat_diff[i] = 0;
				atomic_long_add(v, &pgdat->vm_stat[i]);
				global_node_diff[i] += v;
			}
	}

	fold_diff(global_zone_diff, global_node_diff);
}

/*
 * this is only called if !populated_zone(zone), which implies no other users of
 * pset->vm_stat_diff[] exist.
 */
void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats)
{
	unsigned long v;
	int i;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
		if (pzstats->vm_stat_diff[i]) {
			v = pzstats->vm_stat_diff[i];
			pzstats->vm_stat_diff[i] = 0;
			zone_page_state_add(v, zone, i);
		}
	}

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) {
		if (pzstats->vm_numa_event[i]) {
			v = pzstats->vm_numa_event[i];
			pzstats->vm_numa_event[i] = 0;
			zone_numa_event_add(v, zone, i);
		}
	}
#endif
}
#endif

#ifdef CONFIG_NUMA
/*
 * Determine the per node value of a stat item. This function
 * is called frequently in a NUMA machine, so try to be as
 * frugal as possible.
 */
unsigned long sum_zone_node_page_state(int node,
				 enum zone_stat_item item)
{
	struct zone *zones = NODE_DATA(node)->node_zones;
	int i;
	unsigned long count = 0;

	for (i = 0; i < MAX_NR_ZONES; i++)
		count += zone_page_state(zones + i, item);

	return count;
}

/* Determine the per node value of a numa stat item. */
unsigned long sum_zone_numa_event_state(int node,
				 enum numa_stat_item item)
{
	struct zone *zones = NODE_DATA(node)->node_zones;
	unsigned long count = 0;
	int i;

	for (i = 0; i < MAX_NR_ZONES; i++)
		count += zone_numa_event_state(zones + i, item);

	return count;
}

/*
 * Determine the per node value of a stat item.
 */
unsigned long node_page_state_pages(struct pglist_data *pgdat,
				    enum node_stat_item item)
{
	long x = atomic_long_read(&pgdat->vm_stat[item]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

unsigned long node_page_state(struct pglist_data *pgdat,
			      enum node_stat_item item)
{
	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	return node_page_state_pages(pgdat, item);
}
#endif

/*
 * Count number of pages "struct page" and "struct page_ext" consume.
 * nr_memmap_boot_pages: # of pages allocated by boot allocator
 * nr_memmap_pages: # of pages that were allocated by buddy allocator
 */
static atomic_long_t nr_memmap_boot_pages = ATOMIC_LONG_INIT(0);
static atomic_long_t nr_memmap_pages = ATOMIC_LONG_INIT(0);

void memmap_boot_pages_add(long delta)
{
	atomic_long_add(delta, &nr_memmap_boot_pages);
}

void memmap_pages_add(long delta)
{
	atomic_long_add(delta, &nr_memmap_pages);
}

#ifdef CONFIG_COMPACTION

struct contig_page_info {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

/*
 * Calculate the number of free pages in a zone, how many contiguous
 * pages are free and how many are large enough to satisfy an allocation of
 * the target size. Note that this function makes no attempt to estimate
 * how many suitable free blocks there *might* be if MOVABLE pages were
 * migrated. Calculating that is possible, but expensive and can be
 * figured out from userspace
 */
static void fill_contig_page_info(struct zone *zone,
				unsigned int suitable_order,
				struct contig_page_info *info)
{
	unsigned int order;

	info->free_pages = 0;
	info->free_blocks_total = 0;
	info->free_blocks_suitable = 0;

	for (order = 0; order < NR_PAGE_ORDERS; order++) {
		unsigned long blocks;

		/*
		 * Count number of free blocks.
		 *
		 * Access to nr_free is lockless as nr_free is used only for
		 * diagnostic purposes. Use data_race to avoid KCSAN warning.
		 */
		blocks = data_race(zone->free_area[order].nr_free);
		info->free_blocks_total += blocks;

		/* Count free base pages */
		info->free_pages += blocks << order;

		/* Count the suitable free blocks */
		if (order >= suitable_order)
			info->free_blocks_suitable += blocks <<
						(order - suitable_order);
	}
}

/*
 * A fragmentation index only makes sense if an allocation of a requested
 * size would fail. If that is true, the fragmentation index indicates
 * whether external fragmentation or a lack of memory was the problem.
 * The value can be used to determine if page reclaim or compaction
 * should be used
 */
static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
{
	unsigned long requested = 1UL << order;

	if (WARN_ON_ONCE(order > MAX_PAGE_ORDER))
		return 0;

	if (!info->free_blocks_total)
		return 0;

	/* Fragmentation index only makes sense when a request would fail */
	if (info->free_blocks_suitable)
		return -1000;

	/*
	 * Index is between 0 and 1 so return within 3 decimal places
	 *
	 * 0 => allocation would fail due to lack of memory
	 * 1 => allocation would fail due to fragmentation
	 */
	return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
}
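
/*
 * Reading the index (illustrative numbers): an order-4 request (16 pages)
 * that fails in a zone holding 1000 free pages as 1000 order-0 blocks
 * gives 1000 - (1000 + 1000 * 1000 / 16) / 1000 = 937, i.e. close to
 * 1000, so compaction is the right response. If those pages instead sat
 * in 125 order-3 blocks (still too small for the request) the index
 * drops to 492, pointing more towards a plain shortage of memory.
 */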

/*
 * Calculates external fragmentation within a zone wrt the given order.
 * It is defined as the percentage of pages found in blocks of size
 * less than 1 << order. It returns values in range [0, 100].
 */
unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	if (info.free_pages == 0)
		return 0;

	return div_u64((info.free_pages -
			(info.free_blocks_suitable << order)) * 100,
			info.free_pages);
}

/* Same as __fragmentation index but allocs contig_page_info on stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
	struct contig_page_info info;

	fill_contig_page_info(zone, order, &info);
	return __fragmentation_index(order, &info);
}
#endif

#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \
    defined(CONFIG_NUMA) || defined(CONFIG_MEMCG)
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
#else
#define TEXT_FOR_DMA(xx)
#endif

#ifdef CONFIG_ZONE_DMA32
#define TEXT_FOR_DMA32(xx) xx "_dma32",
#else
#define TEXT_FOR_DMA32(xx)
#endif

#ifdef CONFIG_HIGHMEM
#define TEXT_FOR_HIGHMEM(xx) xx "_high",
#else
#define TEXT_FOR_HIGHMEM(xx)
#endif

#ifdef CONFIG_ZONE_DEVICE
#define TEXT_FOR_DEVICE(xx) xx "_device",
#else
#define TEXT_FOR_DEVICE(xx)
#endif
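
/*
 * The TEXTS_FOR_ZONES() helper below stamps out one counter name per
 * configured zone type; with every zone built in,
 * TEXTS_FOR_ZONES("pgskip") expands to "pgskip_dma", "pgskip_dma32",
 * "pgskip_normal", "pgskip_high", "pgskip_movable", "pgskip_device".
 * The per-zone helpers above expand to nothing for zones that are not
 * configured, keeping the string list in step with the zone enum.
 */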
1108 TEXT_FOR_HIGHMEM(xx) xx "_movable", 1188 TEXT_ << 1189 1109 1190 const char * const vmstat_text[] = { 1110 const char * const vmstat_text[] = { 1191 /* enum zone_stat_item counters */ 1111 /* enum zone_stat_item counters */ 1192 "nr_free_pages", 1112 "nr_free_pages", 1193 "nr_zone_inactive_anon", 1113 "nr_zone_inactive_anon", 1194 "nr_zone_active_anon", 1114 "nr_zone_active_anon", 1195 "nr_zone_inactive_file", 1115 "nr_zone_inactive_file", 1196 "nr_zone_active_file", 1116 "nr_zone_active_file", 1197 "nr_zone_unevictable", 1117 "nr_zone_unevictable", 1198 "nr_zone_write_pending", 1118 "nr_zone_write_pending", 1199 "nr_mlock", 1119 "nr_mlock", >> 1120 "nr_page_table_pages", >> 1121 "nr_kernel_stack", >> 1122 #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) >> 1123 "nr_shadow_call_stack", >> 1124 #endif 1200 "nr_bounce", 1125 "nr_bounce", 1201 #if IS_ENABLED(CONFIG_ZSMALLOC) 1126 #if IS_ENABLED(CONFIG_ZSMALLOC) 1202 "nr_zspages", 1127 "nr_zspages", 1203 #endif 1128 #endif 1204 "nr_free_cma", 1129 "nr_free_cma", 1205 #ifdef CONFIG_UNACCEPTED_MEMORY << 1206 "nr_unaccepted", << 1207 #endif << 1208 1130 1209 /* enum numa_stat_item counters */ 1131 /* enum numa_stat_item counters */ 1210 #ifdef CONFIG_NUMA 1132 #ifdef CONFIG_NUMA 1211 "numa_hit", 1133 "numa_hit", 1212 "numa_miss", 1134 "numa_miss", 1213 "numa_foreign", 1135 "numa_foreign", 1214 "numa_interleave", 1136 "numa_interleave", 1215 "numa_local", 1137 "numa_local", 1216 "numa_other", 1138 "numa_other", 1217 #endif 1139 #endif 1218 1140 1219 /* enum node_stat_item counters */ 1141 /* enum node_stat_item counters */ 1220 "nr_inactive_anon", 1142 "nr_inactive_anon", 1221 "nr_active_anon", 1143 "nr_active_anon", 1222 "nr_inactive_file", 1144 "nr_inactive_file", 1223 "nr_active_file", 1145 "nr_active_file", 1224 "nr_unevictable", 1146 "nr_unevictable", 1225 "nr_slab_reclaimable", 1147 "nr_slab_reclaimable", 1226 "nr_slab_unreclaimable", 1148 "nr_slab_unreclaimable", 1227 "nr_isolated_anon", 1149 "nr_isolated_anon", 1228 "nr_isolated_file", 1150 "nr_isolated_file", 1229 "workingset_nodes", 1151 "workingset_nodes", 1230 "workingset_refault_anon", !! 1152 "workingset_refault", 1231 "workingset_refault_file", !! 1153 "workingset_activate", 1232 "workingset_activate_anon", !! 1154 "workingset_restore", 1233 "workingset_activate_file", << 1234 "workingset_restore_anon", << 1235 "workingset_restore_file", << 1236 "workingset_nodereclaim", 1155 "workingset_nodereclaim", 1237 "nr_anon_pages", 1156 "nr_anon_pages", 1238 "nr_mapped", 1157 "nr_mapped", 1239 "nr_file_pages", 1158 "nr_file_pages", 1240 "nr_dirty", 1159 "nr_dirty", 1241 "nr_writeback", 1160 "nr_writeback", 1242 "nr_writeback_temp", 1161 "nr_writeback_temp", 1243 "nr_shmem", 1162 "nr_shmem", 1244 "nr_shmem_hugepages", 1163 "nr_shmem_hugepages", 1245 "nr_shmem_pmdmapped", 1164 "nr_shmem_pmdmapped", 1246 "nr_file_hugepages", 1165 "nr_file_hugepages", 1247 "nr_file_pmdmapped", 1166 "nr_file_pmdmapped", 1248 "nr_anon_transparent_hugepages", 1167 "nr_anon_transparent_hugepages", 1249 "nr_vmscan_write", 1168 "nr_vmscan_write", 1250 "nr_vmscan_immediate_reclaim", 1169 "nr_vmscan_immediate_reclaim", 1251 "nr_dirtied", 1170 "nr_dirtied", 1252 "nr_written", 1171 "nr_written", 1253 "nr_throttled_written", << 1254 "nr_kernel_misc_reclaimable", 1172 "nr_kernel_misc_reclaimable", 1255 "nr_foll_pin_acquired", 1173 "nr_foll_pin_acquired", 1256 "nr_foll_pin_released", 1174 "nr_foll_pin_released", 1257 "nr_kernel_stack", !! 1175 1258 #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) !! 
1176 /* enum writeback_stat_item counters */ 1259 "nr_shadow_call_stack", << 1260 #endif << 1261 "nr_page_table_pages", << 1262 "nr_sec_page_table_pages", << 1263 #ifdef CONFIG_IOMMU_SUPPORT << 1264 "nr_iommu_pages", << 1265 #endif << 1266 #ifdef CONFIG_SWAP << 1267 "nr_swapcached", << 1268 #endif << 1269 #ifdef CONFIG_NUMA_BALANCING << 1270 "pgpromote_success", << 1271 "pgpromote_candidate", << 1272 #endif << 1273 "pgdemote_kswapd", << 1274 "pgdemote_direct", << 1275 "pgdemote_khugepaged", << 1276 /* system-wide enum vm_stat_item coun << 1277 "nr_dirty_threshold", 1177 "nr_dirty_threshold", 1278 "nr_dirty_background_threshold", 1178 "nr_dirty_background_threshold", 1279 "nr_memmap_pages", << 1280 "nr_memmap_boot_pages", << 1281 1179 1282 #if defined(CONFIG_VM_EVENT_COUNTERS) || defi 1180 #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) 1283 /* enum vm_event_item counters */ 1181 /* enum vm_event_item counters */ 1284 "pgpgin", 1182 "pgpgin", 1285 "pgpgout", 1183 "pgpgout", 1286 "pswpin", 1184 "pswpin", 1287 "pswpout", 1185 "pswpout", 1288 1186 1289 TEXTS_FOR_ZONES("pgalloc") 1187 TEXTS_FOR_ZONES("pgalloc") 1290 TEXTS_FOR_ZONES("allocstall") 1188 TEXTS_FOR_ZONES("allocstall") 1291 TEXTS_FOR_ZONES("pgskip") 1189 TEXTS_FOR_ZONES("pgskip") 1292 1190 1293 "pgfree", 1191 "pgfree", 1294 "pgactivate", 1192 "pgactivate", 1295 "pgdeactivate", 1193 "pgdeactivate", 1296 "pglazyfree", 1194 "pglazyfree", 1297 1195 1298 "pgfault", 1196 "pgfault", 1299 "pgmajfault", 1197 "pgmajfault", 1300 "pglazyfreed", 1198 "pglazyfreed", 1301 1199 1302 "pgrefill", 1200 "pgrefill", 1303 "pgreuse", << 1304 "pgsteal_kswapd", 1201 "pgsteal_kswapd", 1305 "pgsteal_direct", 1202 "pgsteal_direct", 1306 "pgsteal_khugepaged", << 1307 "pgscan_kswapd", 1203 "pgscan_kswapd", 1308 "pgscan_direct", 1204 "pgscan_direct", 1309 "pgscan_khugepaged", << 1310 "pgscan_direct_throttle", 1205 "pgscan_direct_throttle", 1311 "pgscan_anon", 1206 "pgscan_anon", 1312 "pgscan_file", 1207 "pgscan_file", 1313 "pgsteal_anon", 1208 "pgsteal_anon", 1314 "pgsteal_file", 1209 "pgsteal_file", 1315 1210 1316 #ifdef CONFIG_NUMA 1211 #ifdef CONFIG_NUMA 1317 "zone_reclaim_success", << 1318 "zone_reclaim_failed", 1212 "zone_reclaim_failed", 1319 #endif 1213 #endif 1320 "pginodesteal", 1214 "pginodesteal", 1321 "slabs_scanned", 1215 "slabs_scanned", 1322 "kswapd_inodesteal", 1216 "kswapd_inodesteal", 1323 "kswapd_low_wmark_hit_quickly", 1217 "kswapd_low_wmark_hit_quickly", 1324 "kswapd_high_wmark_hit_quickly", 1218 "kswapd_high_wmark_hit_quickly", 1325 "pageoutrun", 1219 "pageoutrun", 1326 1220 1327 "pgrotated", 1221 "pgrotated", 1328 1222 1329 "drop_pagecache", 1223 "drop_pagecache", 1330 "drop_slab", 1224 "drop_slab", 1331 "oom_kill", 1225 "oom_kill", 1332 1226 1333 #ifdef CONFIG_NUMA_BALANCING 1227 #ifdef CONFIG_NUMA_BALANCING 1334 "numa_pte_updates", 1228 "numa_pte_updates", 1335 "numa_huge_pte_updates", 1229 "numa_huge_pte_updates", 1336 "numa_hint_faults", 1230 "numa_hint_faults", 1337 "numa_hint_faults_local", 1231 "numa_hint_faults_local", 1338 "numa_pages_migrated", 1232 "numa_pages_migrated", 1339 #endif 1233 #endif 1340 #ifdef CONFIG_MIGRATION 1234 #ifdef CONFIG_MIGRATION 1341 "pgmigrate_success", 1235 "pgmigrate_success", 1342 "pgmigrate_fail", 1236 "pgmigrate_fail", 1343 "thp_migration_success", << 1344 "thp_migration_fail", << 1345 "thp_migration_split", << 1346 #endif 1237 #endif 1347 #ifdef CONFIG_COMPACTION 1238 #ifdef CONFIG_COMPACTION 1348 "compact_migrate_scanned", 1239 "compact_migrate_scanned", 1349 "compact_free_scanned", 
1240 "compact_free_scanned", 1350 "compact_isolated", 1241 "compact_isolated", 1351 "compact_stall", 1242 "compact_stall", 1352 "compact_fail", 1243 "compact_fail", 1353 "compact_success", 1244 "compact_success", 1354 "compact_daemon_wake", 1245 "compact_daemon_wake", 1355 "compact_daemon_migrate_scanned", 1246 "compact_daemon_migrate_scanned", 1356 "compact_daemon_free_scanned", 1247 "compact_daemon_free_scanned", 1357 #endif 1248 #endif 1358 1249 1359 #ifdef CONFIG_HUGETLB_PAGE 1250 #ifdef CONFIG_HUGETLB_PAGE 1360 "htlb_buddy_alloc_success", 1251 "htlb_buddy_alloc_success", 1361 "htlb_buddy_alloc_fail", 1252 "htlb_buddy_alloc_fail", 1362 #endif 1253 #endif 1363 #ifdef CONFIG_CMA << 1364 "cma_alloc_success", << 1365 "cma_alloc_fail", << 1366 #endif << 1367 "unevictable_pgs_culled", 1254 "unevictable_pgs_culled", 1368 "unevictable_pgs_scanned", 1255 "unevictable_pgs_scanned", 1369 "unevictable_pgs_rescued", 1256 "unevictable_pgs_rescued", 1370 "unevictable_pgs_mlocked", 1257 "unevictable_pgs_mlocked", 1371 "unevictable_pgs_munlocked", 1258 "unevictable_pgs_munlocked", 1372 "unevictable_pgs_cleared", 1259 "unevictable_pgs_cleared", 1373 "unevictable_pgs_stranded", 1260 "unevictable_pgs_stranded", 1374 1261 1375 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1262 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1376 "thp_fault_alloc", 1263 "thp_fault_alloc", 1377 "thp_fault_fallback", 1264 "thp_fault_fallback", 1378 "thp_fault_fallback_charge", 1265 "thp_fault_fallback_charge", 1379 "thp_collapse_alloc", 1266 "thp_collapse_alloc", 1380 "thp_collapse_alloc_failed", 1267 "thp_collapse_alloc_failed", 1381 "thp_file_alloc", 1268 "thp_file_alloc", 1382 "thp_file_fallback", 1269 "thp_file_fallback", 1383 "thp_file_fallback_charge", 1270 "thp_file_fallback_charge", 1384 "thp_file_mapped", 1271 "thp_file_mapped", 1385 "thp_split_page", 1272 "thp_split_page", 1386 "thp_split_page_failed", 1273 "thp_split_page_failed", 1387 "thp_deferred_split_page", 1274 "thp_deferred_split_page", 1388 "thp_underused_split_page", << 1389 "thp_split_pmd", 1275 "thp_split_pmd", 1390 "thp_scan_exceed_none_pte", << 1391 "thp_scan_exceed_swap_pte", << 1392 "thp_scan_exceed_share_pte", << 1393 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_ 1276 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD 1394 "thp_split_pud", 1277 "thp_split_pud", 1395 #endif 1278 #endif 1396 "thp_zero_page_alloc", 1279 "thp_zero_page_alloc", 1397 "thp_zero_page_alloc_failed", 1280 "thp_zero_page_alloc_failed", 1398 "thp_swpout", 1281 "thp_swpout", 1399 "thp_swpout_fallback", 1282 "thp_swpout_fallback", 1400 #endif 1283 #endif 1401 #ifdef CONFIG_MEMORY_BALLOON 1284 #ifdef CONFIG_MEMORY_BALLOON 1402 "balloon_inflate", 1285 "balloon_inflate", 1403 "balloon_deflate", 1286 "balloon_deflate", 1404 #ifdef CONFIG_BALLOON_COMPACTION 1287 #ifdef CONFIG_BALLOON_COMPACTION 1405 "balloon_migrate", 1288 "balloon_migrate", 1406 #endif 1289 #endif 1407 #endif /* CONFIG_MEMORY_BALLOON */ 1290 #endif /* CONFIG_MEMORY_BALLOON */ 1408 #ifdef CONFIG_DEBUG_TLBFLUSH 1291 #ifdef CONFIG_DEBUG_TLBFLUSH 1409 "nr_tlb_remote_flush", 1292 "nr_tlb_remote_flush", 1410 "nr_tlb_remote_flush_received", 1293 "nr_tlb_remote_flush_received", 1411 "nr_tlb_local_flush_all", 1294 "nr_tlb_local_flush_all", 1412 "nr_tlb_local_flush_one", 1295 "nr_tlb_local_flush_one", 1413 #endif /* CONFIG_DEBUG_TLBFLUSH */ 1296 #endif /* CONFIG_DEBUG_TLBFLUSH */ 1414 1297 >> 1298 #ifdef CONFIG_DEBUG_VM_VMACACHE >> 1299 "vmacache_find_calls", >> 1300 "vmacache_find_hits", >> 1301 #endif 1415 #ifdef CONFIG_SWAP 1302 #ifdef CONFIG_SWAP 
1416 "swap_ra", 1303 "swap_ra", 1417 "swap_ra_hit", 1304 "swap_ra_hit", 1418 #ifdef CONFIG_KSM << 1419 "ksm_swpin_copy", << 1420 #endif << 1421 #endif << 1422 #ifdef CONFIG_KSM << 1423 "cow_ksm", << 1424 #endif << 1425 #ifdef CONFIG_ZSWAP << 1426 "zswpin", << 1427 "zswpout", << 1428 "zswpwb", << 1429 #endif << 1430 #ifdef CONFIG_X86 << 1431 "direct_map_level2_splits", << 1432 "direct_map_level3_splits", << 1433 #endif << 1434 #ifdef CONFIG_PER_VMA_LOCK_STATS << 1435 "vma_lock_success", << 1436 "vma_lock_abort", << 1437 "vma_lock_retry", << 1438 "vma_lock_miss", << 1439 #endif << 1440 #ifdef CONFIG_DEBUG_STACK_USAGE << 1441 "kstack_1k", << 1442 #if THREAD_SIZE > 1024 << 1443 "kstack_2k", << 1444 #endif << 1445 #if THREAD_SIZE > 2048 << 1446 "kstack_4k", << 1447 #endif << 1448 #if THREAD_SIZE > 4096 << 1449 "kstack_8k", << 1450 #endif << 1451 #if THREAD_SIZE > 8192 << 1452 "kstack_16k", << 1453 #endif << 1454 #if THREAD_SIZE > 16384 << 1455 "kstack_32k", << 1456 #endif << 1457 #if THREAD_SIZE > 32768 << 1458 "kstack_64k", << 1459 #endif << 1460 #if THREAD_SIZE > 65536 << 1461 "kstack_rest", << 1462 #endif << 1463 #endif 1305 #endif 1464 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_ 1306 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ 1465 }; 1307 }; 1466 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || C 1308 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ 1467 1309 1468 #if (defined(CONFIG_DEBUG_FS) && defined(CONF 1310 #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ 1469 defined(CONFIG_PROC_FS) 1311 defined(CONFIG_PROC_FS) 1470 static void *frag_start(struct seq_file *m, l 1312 static void *frag_start(struct seq_file *m, loff_t *pos) 1471 { 1313 { 1472 pg_data_t *pgdat; 1314 pg_data_t *pgdat; 1473 loff_t node = *pos; 1315 loff_t node = *pos; 1474 1316 1475 for (pgdat = first_online_pgdat(); 1317 for (pgdat = first_online_pgdat(); 1476 pgdat && node; 1318 pgdat && node; 1477 pgdat = next_online_pgdat(pgdat) 1319 pgdat = next_online_pgdat(pgdat)) 1478 --node; 1320 --node; 1479 1321 1480 return pgdat; 1322 return pgdat; 1481 } 1323 } 1482 1324 1483 static void *frag_next(struct seq_file *m, vo 1325 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) 1484 { 1326 { 1485 pg_data_t *pgdat = (pg_data_t *)arg; 1327 pg_data_t *pgdat = (pg_data_t *)arg; 1486 1328 1487 (*pos)++; 1329 (*pos)++; 1488 return next_online_pgdat(pgdat); 1330 return next_online_pgdat(pgdat); 1489 } 1331 } 1490 1332 1491 static void frag_stop(struct seq_file *m, voi 1333 static void frag_stop(struct seq_file *m, void *arg) 1492 { 1334 { 1493 } 1335 } 1494 1336 1495 /* 1337 /* 1496 * Walk zones in a node and print using a cal 1338 * Walk zones in a node and print using a callback. 1497 * If @assert_populated is true, only use cal 1339 * If @assert_populated is true, only use callback for zones that are populated. 
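 * If @nolock is true, zone->lock is not taken around @print; the only
 * caller in this file that passes true is pagetypeinfo_showmixedcount(),
 * so its page_owner walk runs without the zone lock held.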
1498 */ 1340 */ 1499 static void walk_zones_in_node(struct seq_fil 1341 static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, 1500 bool assert_populated, bool n 1342 bool assert_populated, bool nolock, 1501 void (*print)(struct seq_file 1343 void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) 1502 { 1344 { 1503 struct zone *zone; 1345 struct zone *zone; 1504 struct zone *node_zones = pgdat->node 1346 struct zone *node_zones = pgdat->node_zones; 1505 unsigned long flags; 1347 unsigned long flags; 1506 1348 1507 for (zone = node_zones; zone - node_z 1349 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 1508 if (assert_populated && !popu 1350 if (assert_populated && !populated_zone(zone)) 1509 continue; 1351 continue; 1510 1352 1511 if (!nolock) 1353 if (!nolock) 1512 spin_lock_irqsave(&zo 1354 spin_lock_irqsave(&zone->lock, flags); 1513 print(m, pgdat, zone); 1355 print(m, pgdat, zone); 1514 if (!nolock) 1356 if (!nolock) 1515 spin_unlock_irqrestor 1357 spin_unlock_irqrestore(&zone->lock, flags); 1516 } 1358 } 1517 } 1359 } 1518 #endif 1360 #endif 1519 1361 1520 #ifdef CONFIG_PROC_FS 1362 #ifdef CONFIG_PROC_FS 1521 static void frag_show_print(struct seq_file * 1363 static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, 1522 1364 struct zone *zone) 1523 { 1365 { 1524 int order; 1366 int order; 1525 1367 1526 seq_printf(m, "Node %d, zone %8s ", p 1368 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); 1527 for (order = 0; order < NR_PAGE_ORDER !! 1369 for (order = 0; order < MAX_ORDER; ++order) 1528 /* !! 1370 seq_printf(m, "%6lu ", zone->free_area[order].nr_free); 1529 * Access to nr_free is lockl << 1530 * printing purposes. Use dat << 1531 */ << 1532 seq_printf(m, "%6lu ", data_r << 1533 seq_putc(m, '\n'); 1371 seq_putc(m, '\n'); 1534 } 1372 } 1535 1373 1536 /* 1374 /* 1537 * This walks the free areas for each zone. 1375 * This walks the free areas for each zone. 1538 */ 1376 */ 1539 static int frag_show(struct seq_file *m, void 1377 static int frag_show(struct seq_file *m, void *arg) 1540 { 1378 { 1541 pg_data_t *pgdat = (pg_data_t *)arg; 1379 pg_data_t *pgdat = (pg_data_t *)arg; 1542 walk_zones_in_node(m, pgdat, true, fa 1380 walk_zones_in_node(m, pgdat, true, false, frag_show_print); 1543 return 0; 1381 return 0; 1544 } 1382 } 1545 1383 1546 static void pagetypeinfo_showfree_print(struc 1384 static void pagetypeinfo_showfree_print(struct seq_file *m, 1547 pg_da 1385 pg_data_t *pgdat, struct zone *zone) 1548 { 1386 { 1549 int order, mtype; 1387 int order, mtype; 1550 1388 1551 for (mtype = 0; mtype < MIGRATE_TYPES 1389 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) { 1552 seq_printf(m, "Node %4d, zone 1390 seq_printf(m, "Node %4d, zone %8s, type %12s ", 1553 pgdat 1391 pgdat->node_id, 1554 zone- 1392 zone->name, 1555 migra 1393 migratetype_names[mtype]); 1556 for (order = 0; order < NR_PA !! 
1394 for (order = 0; order < MAX_ORDER; ++order) { 1557 unsigned long freecou 1395 unsigned long freecount = 0; 1558 struct free_area *are 1396 struct free_area *area; 1559 struct list_head *cur 1397 struct list_head *curr; 1560 bool overflow = false 1398 bool overflow = false; 1561 1399 1562 area = &(zone->free_a 1400 area = &(zone->free_area[order]); 1563 1401 1564 list_for_each(curr, & 1402 list_for_each(curr, &area->free_list[mtype]) { 1565 /* 1403 /* 1566 * Cap the fr 1404 * Cap the free_list iteration because it might 1567 * be really 1405 * be really large and we are under a spinlock 1568 * so a long 1406 * so a long time spent here could trigger a 1569 * hard locku 1407 * hard lockup detector. Anyway this is a 1570 * debugging 1408 * debugging tool so knowing there is a handful 1571 * of pages o 1409 * of pages of this order should be more than 1572 * sufficient 1410 * sufficient. 1573 */ 1411 */ 1574 if (++freecou 1412 if (++freecount >= 100000) { 1575 overf 1413 overflow = true; 1576 break 1414 break; 1577 } 1415 } 1578 } 1416 } 1579 seq_printf(m, "%s%6lu 1417 seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount); 1580 spin_unlock_irq(&zone 1418 spin_unlock_irq(&zone->lock); 1581 cond_resched(); 1419 cond_resched(); 1582 spin_lock_irq(&zone-> 1420 spin_lock_irq(&zone->lock); 1583 } 1421 } 1584 seq_putc(m, '\n'); 1422 seq_putc(m, '\n'); 1585 } 1423 } 1586 } 1424 } 1587 1425 1588 /* Print out the free pages at each order for 1426 /* Print out the free pages at each order for each migatetype */ 1589 static void pagetypeinfo_showfree(struct seq_ !! 1427 static int pagetypeinfo_showfree(struct seq_file *m, void *arg) 1590 { 1428 { 1591 int order; 1429 int order; 1592 pg_data_t *pgdat = (pg_data_t *)arg; 1430 pg_data_t *pgdat = (pg_data_t *)arg; 1593 1431 1594 /* Print header */ 1432 /* Print header */ 1595 seq_printf(m, "%-43s ", "Free pages c 1433 seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); 1596 for (order = 0; order < NR_PAGE_ORDER !! 
1434 for (order = 0; order < MAX_ORDER; ++order) 1597 seq_printf(m, "%6d ", order); 1435 seq_printf(m, "%6d ", order); 1598 seq_putc(m, '\n'); 1436 seq_putc(m, '\n'); 1599 1437 1600 walk_zones_in_node(m, pgdat, true, fa 1438 walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print); >> 1439 >> 1440 return 0; 1601 } 1441 } 1602 1442 1603 static void pagetypeinfo_showblockcount_print 1443 static void pagetypeinfo_showblockcount_print(struct seq_file *m, 1604 pg_da 1444 pg_data_t *pgdat, struct zone *zone) 1605 { 1445 { 1606 int mtype; 1446 int mtype; 1607 unsigned long pfn; 1447 unsigned long pfn; 1608 unsigned long start_pfn = zone->zone_ 1448 unsigned long start_pfn = zone->zone_start_pfn; 1609 unsigned long end_pfn = zone_end_pfn( 1449 unsigned long end_pfn = zone_end_pfn(zone); 1610 unsigned long count[MIGRATE_TYPES] = 1450 unsigned long count[MIGRATE_TYPES] = { 0, }; 1611 1451 1612 for (pfn = start_pfn; pfn < end_pfn; 1452 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 1613 struct page *page; 1453 struct page *page; 1614 1454 1615 page = pfn_to_online_page(pfn 1455 page = pfn_to_online_page(pfn); 1616 if (!page) 1456 if (!page) 1617 continue; 1457 continue; 1618 1458 >> 1459 /* Watch for unexpected holes punched in the memmap */ >> 1460 if (!memmap_valid_within(pfn, page, zone)) >> 1461 continue; >> 1462 1619 if (page_zone(page) != zone) 1463 if (page_zone(page) != zone) 1620 continue; 1464 continue; 1621 1465 1622 mtype = get_pageblock_migrate 1466 mtype = get_pageblock_migratetype(page); 1623 1467 1624 if (mtype < MIGRATE_TYPES) 1468 if (mtype < MIGRATE_TYPES) 1625 count[mtype]++; 1469 count[mtype]++; 1626 } 1470 } 1627 1471 1628 /* Print counts */ 1472 /* Print counts */ 1629 seq_printf(m, "Node %d, zone %8s ", p 1473 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); 1630 for (mtype = 0; mtype < MIGRATE_TYPES 1474 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) 1631 seq_printf(m, "%12lu ", count 1475 seq_printf(m, "%12lu ", count[mtype]); 1632 seq_putc(m, '\n'); 1476 seq_putc(m, '\n'); 1633 } 1477 } 1634 1478 1635 /* Print out the number of pageblocks for eac 1479 /* Print out the number of pageblocks for each migratetype */ 1636 static void pagetypeinfo_showblockcount(struc !! 1480 static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) 1637 { 1481 { 1638 int mtype; 1482 int mtype; 1639 pg_data_t *pgdat = (pg_data_t *)arg; 1483 pg_data_t *pgdat = (pg_data_t *)arg; 1640 1484 1641 seq_printf(m, "\n%-23s", "Number of b 1485 seq_printf(m, "\n%-23s", "Number of blocks type "); 1642 for (mtype = 0; mtype < MIGRATE_TYPES 1486 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) 1643 seq_printf(m, "%12s ", migrat 1487 seq_printf(m, "%12s ", migratetype_names[mtype]); 1644 seq_putc(m, '\n'); 1488 seq_putc(m, '\n'); 1645 walk_zones_in_node(m, pgdat, true, fa 1489 walk_zones_in_node(m, pgdat, true, false, 1646 pagetypeinfo_showblockcount_p 1490 pagetypeinfo_showblockcount_print); >> 1491 >> 1492 return 0; 1647 } 1493 } 1648 1494 1649 /* 1495 /* 1650 * Print out the number of pageblocks for eac 1496 * Print out the number of pageblocks for each migratetype that contain pages 1651 * of other types. This gives an indication o 1497 * of other types. This gives an indication of how well fallbacks are being 1652 * contained by rmqueue_fallback(). It requir 1498 * contained by rmqueue_fallback(). 
It requires information from PAGE_OWNER 1653 * to determine what is going on 1499 * to determine what is going on 1654 */ 1500 */ 1655 static void pagetypeinfo_showmixedcount(struc 1501 static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) 1656 { 1502 { 1657 #ifdef CONFIG_PAGE_OWNER 1503 #ifdef CONFIG_PAGE_OWNER 1658 int mtype; 1504 int mtype; 1659 1505 1660 if (!static_branch_unlikely(&page_own 1506 if (!static_branch_unlikely(&page_owner_inited)) 1661 return; 1507 return; 1662 1508 1663 drain_all_pages(NULL); 1509 drain_all_pages(NULL); 1664 1510 1665 seq_printf(m, "\n%-23s", "Number of m 1511 seq_printf(m, "\n%-23s", "Number of mixed blocks "); 1666 for (mtype = 0; mtype < MIGRATE_TYPES 1512 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) 1667 seq_printf(m, "%12s ", migrat 1513 seq_printf(m, "%12s ", migratetype_names[mtype]); 1668 seq_putc(m, '\n'); 1514 seq_putc(m, '\n'); 1669 1515 1670 walk_zones_in_node(m, pgdat, true, tr 1516 walk_zones_in_node(m, pgdat, true, true, 1671 pagetypeinfo_showmixedcount_p 1517 pagetypeinfo_showmixedcount_print); 1672 #endif /* CONFIG_PAGE_OWNER */ 1518 #endif /* CONFIG_PAGE_OWNER */ 1673 } 1519 } 1674 1520 1675 /* 1521 /* 1676 * This prints out statistics in relation to 1522 * This prints out statistics in relation to grouping pages by mobility. 1677 * It is expensive to collect so do not const 1523 * It is expensive to collect so do not constantly read the file. 1678 */ 1524 */ 1679 static int pagetypeinfo_show(struct seq_file 1525 static int pagetypeinfo_show(struct seq_file *m, void *arg) 1680 { 1526 { 1681 pg_data_t *pgdat = (pg_data_t *)arg; 1527 pg_data_t *pgdat = (pg_data_t *)arg; 1682 1528 1683 /* check memoryless node */ 1529 /* check memoryless node */ 1684 if (!node_state(pgdat->node_id, N_MEM 1530 if (!node_state(pgdat->node_id, N_MEMORY)) 1685 return 0; 1531 return 0; 1686 1532 1687 seq_printf(m, "Page block order: %d\n 1533 seq_printf(m, "Page block order: %d\n", pageblock_order); 1688 seq_printf(m, "Pages per block: %lu\ 1534 seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages); 1689 seq_putc(m, '\n'); 1535 seq_putc(m, '\n'); 1690 pagetypeinfo_showfree(m, pgdat); 1536 pagetypeinfo_showfree(m, pgdat); 1691 pagetypeinfo_showblockcount(m, pgdat) 1537 pagetypeinfo_showblockcount(m, pgdat); 1692 pagetypeinfo_showmixedcount(m, pgdat) 1538 pagetypeinfo_showmixedcount(m, pgdat); 1693 1539 1694 return 0; 1540 return 0; 1695 } 1541 } 1696 1542 1697 static const struct seq_operations fragmentat 1543 static const struct seq_operations fragmentation_op = { 1698 .start = frag_start, 1544 .start = frag_start, 1699 .next = frag_next, 1545 .next = frag_next, 1700 .stop = frag_stop, 1546 .stop = frag_stop, 1701 .show = frag_show, 1547 .show = frag_show, 1702 }; 1548 }; 1703 1549 1704 static const struct seq_operations pagetypein 1550 static const struct seq_operations pagetypeinfo_op = { 1705 .start = frag_start, 1551 .start = frag_start, 1706 .next = frag_next, 1552 .next = frag_next, 1707 .stop = frag_stop, 1553 .stop = frag_stop, 1708 .show = pagetypeinfo_show, 1554 .show = pagetypeinfo_show, 1709 }; 1555 }; 1710 1556 1711 static bool is_zone_first_populated(pg_data_t 1557 static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone) 1712 { 1558 { 1713 int zid; 1559 int zid; 1714 1560 1715 for (zid = 0; zid < MAX_NR_ZONES; zid 1561 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1716 struct zone *compare = &pgdat 1562 struct zone *compare = &pgdat->node_zones[zid]; 1717 1563 1718 if (populated_zone(compare)) 1564 if 
(populated_zone(compare)) 1719 return zone == compar 1565 return zone == compare; 1720 } 1566 } 1721 1567 1722 return false; 1568 return false; 1723 } 1569 } 1724 1570 1725 static void zoneinfo_show_print(struct seq_fi 1571 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 1726 1572 struct zone *zone) 1727 { 1573 { 1728 int i; 1574 int i; 1729 seq_printf(m, "Node %d, zone %8s", pg 1575 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); 1730 if (is_zone_first_populated(pgdat, zo 1576 if (is_zone_first_populated(pgdat, zone)) { 1731 seq_printf(m, "\n per-node s 1577 seq_printf(m, "\n per-node stats"); 1732 for (i = 0; i < NR_VM_NODE_ST 1578 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { 1733 unsigned long pages = << 1734 << 1735 if (vmstat_item_print << 1736 pages /= HPAG << 1737 seq_printf(m, "\n 1579 seq_printf(m, "\n %-12s %lu", node_stat_name(i), 1738 pages); !! 1580 node_page_state(pgdat, i)); 1739 } 1581 } 1740 } 1582 } 1741 seq_printf(m, 1583 seq_printf(m, 1742 "\n pages free %lu" 1584 "\n pages free %lu" 1743 "\n boost %lu" << 1744 "\n min %lu" 1585 "\n min %lu" 1745 "\n low %lu" 1586 "\n low %lu" 1746 "\n high %lu" 1587 "\n high %lu" 1747 "\n promo %lu" << 1748 "\n spanned %lu" 1588 "\n spanned %lu" 1749 "\n present %lu" 1589 "\n present %lu" 1750 "\n managed %lu" !! 1590 "\n managed %lu", 1751 "\n cma %lu", << 1752 zone_page_state(zone, NR_F 1591 zone_page_state(zone, NR_FREE_PAGES), 1753 zone->watermark_boost, << 1754 min_wmark_pages(zone), 1592 min_wmark_pages(zone), 1755 low_wmark_pages(zone), 1593 low_wmark_pages(zone), 1756 high_wmark_pages(zone), 1594 high_wmark_pages(zone), 1757 promo_wmark_pages(zone), << 1758 zone->spanned_pages, 1595 zone->spanned_pages, 1759 zone->present_pages, 1596 zone->present_pages, 1760 zone_managed_pages(zone), !! 1597 zone_managed_pages(zone)); 1761 zone_cma_pages(zone)); << 1762 1598 1763 seq_printf(m, 1599 seq_printf(m, 1764 "\n protection: (%l 1600 "\n protection: (%ld", 1765 zone->lowmem_reserve[0]); 1601 zone->lowmem_reserve[0]); 1766 for (i = 1; i < ARRAY_SIZE(zone->lowm 1602 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) 1767 seq_printf(m, ", %ld", zone-> 1603 seq_printf(m, ", %ld", zone->lowmem_reserve[i]); 1768 seq_putc(m, ')'); 1604 seq_putc(m, ')'); 1769 1605 1770 /* If unpopulated, no other informati 1606 /* If unpopulated, no other information is useful */ 1771 if (!populated_zone(zone)) { 1607 if (!populated_zone(zone)) { 1772 seq_putc(m, '\n'); 1608 seq_putc(m, '\n'); 1773 return; 1609 return; 1774 } 1610 } 1775 1611 1776 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS 1612 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1777 seq_printf(m, "\n %-12s 1613 seq_printf(m, "\n %-12s %lu", zone_stat_name(i), 1778 zone_page_state(zo 1614 zone_page_state(zone, i)); 1779 1615 1780 #ifdef CONFIG_NUMA 1616 #ifdef CONFIG_NUMA 1781 for (i = 0; i < NR_VM_NUMA_EVENT_ITEM !! 1617 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) 1782 seq_printf(m, "\n %-12s 1618 seq_printf(m, "\n %-12s %lu", numa_stat_name(i), 1783 zone_numa_event_st !! 1619 zone_numa_state_snapshot(zone, i)); 1784 #endif 1620 #endif 1785 1621 1786 seq_printf(m, "\n pagesets"); 1622 seq_printf(m, "\n pagesets"); 1787 for_each_online_cpu(i) { 1623 for_each_online_cpu(i) { 1788 struct per_cpu_pages *pcp; !! 1624 struct per_cpu_pageset *pageset; 1789 struct per_cpu_zonestat __may << 1790 1625 1791 pcp = per_cpu_ptr(zone->per_c !! 
1626 pageset = per_cpu_ptr(zone->pageset, i); 1792 seq_printf(m, 1627 seq_printf(m, 1793 "\n cpu: %i" 1628 "\n cpu: %i" 1794 "\n c 1629 "\n count: %i" 1795 "\n h 1630 "\n high: %i" 1796 "\n b 1631 "\n batch: %i", 1797 i, 1632 i, 1798 pcp->count, !! 1633 pageset->pcp.count, 1799 pcp->high, !! 1634 pageset->pcp.high, 1800 pcp->batch); !! 1635 pageset->pcp.batch); 1801 #ifdef CONFIG_SMP 1636 #ifdef CONFIG_SMP 1802 pzstats = per_cpu_ptr(zone->p << 1803 seq_printf(m, "\n vm stats t 1637 seq_printf(m, "\n vm stats threshold: %d", 1804 pzstats->stat !! 1638 pageset->stat_threshold); 1805 #endif 1639 #endif 1806 } 1640 } 1807 seq_printf(m, 1641 seq_printf(m, 1808 "\n node_unreclaimable: 1642 "\n node_unreclaimable: %u" 1809 "\n start_pfn: 1643 "\n start_pfn: %lu", 1810 pgdat->kswapd_failures >= 1644 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, 1811 zone->zone_start_pfn); 1645 zone->zone_start_pfn); 1812 seq_putc(m, '\n'); 1646 seq_putc(m, '\n'); 1813 } 1647 } 1814 1648 1815 /* 1649 /* 1816 * Output information about zones in @pgdat. 1650 * Output information about zones in @pgdat. All zones are printed regardless 1817 * of whether they are populated or not: lowm 1651 * of whether they are populated or not: lowmem_reserve_ratio operates on the 1818 * set of all zones and userspace would not b 1652 * set of all zones and userspace would not be aware of such zones if they are 1819 * suppressed here (zoneinfo displays the eff 1653 * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio). 1820 */ 1654 */ 1821 static int zoneinfo_show(struct seq_file *m, 1655 static int zoneinfo_show(struct seq_file *m, void *arg) 1822 { 1656 { 1823 pg_data_t *pgdat = (pg_data_t *)arg; 1657 pg_data_t *pgdat = (pg_data_t *)arg; 1824 walk_zones_in_node(m, pgdat, false, f 1658 walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print); 1825 return 0; 1659 return 0; 1826 } 1660 } 1827 1661 1828 static const struct seq_operations zoneinfo_o 1662 static const struct seq_operations zoneinfo_op = { 1829 .start = frag_start, /* iterate over 1663 .start = frag_start, /* iterate over all zones. The same as in 1830 * fragmentatio 1664 * fragmentation. */ 1831 .next = frag_next, 1665 .next = frag_next, 1832 .stop = frag_stop, 1666 .stop = frag_stop, 1833 .show = zoneinfo_show, 1667 .show = zoneinfo_show, 1834 }; 1668 }; 1835 1669 1836 #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEM 1670 #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \ 1837 NR_VM_NUMA_EVENT_ITE !! 1671 NR_VM_NUMA_STAT_ITEMS + \ 1838 NR_VM_NODE_STAT_ITEM 1672 NR_VM_NODE_STAT_ITEMS + \ 1839 NR_VM_STAT_ITEMS + \ !! 1673 NR_VM_WRITEBACK_STAT_ITEMS + \ 1840 (IS_ENABLED(CONFIG_V 1674 (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? 
\ 1841 NR_VM_EVENT_ITEMS : 1675 NR_VM_EVENT_ITEMS : 0)) 1842 1676 1843 static void *vmstat_start(struct seq_file *m, 1677 static void *vmstat_start(struct seq_file *m, loff_t *pos) 1844 { 1678 { 1845 unsigned long *v; 1679 unsigned long *v; 1846 int i; 1680 int i; 1847 1681 1848 if (*pos >= NR_VMSTAT_ITEMS) 1682 if (*pos >= NR_VMSTAT_ITEMS) 1849 return NULL; 1683 return NULL; 1850 1684 1851 BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) 1685 BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS); 1852 fold_vm_numa_events(); << 1853 v = kmalloc_array(NR_VMSTAT_ITEMS, si 1686 v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL); 1854 m->private = v; 1687 m->private = v; 1855 if (!v) 1688 if (!v) 1856 return ERR_PTR(-ENOMEM); 1689 return ERR_PTR(-ENOMEM); 1857 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS 1690 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1858 v[i] = global_zone_page_state 1691 v[i] = global_zone_page_state(i); 1859 v += NR_VM_ZONE_STAT_ITEMS; 1692 v += NR_VM_ZONE_STAT_ITEMS; 1860 1693 1861 #ifdef CONFIG_NUMA 1694 #ifdef CONFIG_NUMA 1862 for (i = 0; i < NR_VM_NUMA_EVENT_ITEM !! 1695 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) 1863 v[i] = global_numa_event_stat !! 1696 v[i] = global_numa_state(i); 1864 v += NR_VM_NUMA_EVENT_ITEMS; !! 1697 v += NR_VM_NUMA_STAT_ITEMS; 1865 #endif 1698 #endif 1866 1699 1867 for (i = 0; i < NR_VM_NODE_STAT_ITEMS !! 1700 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) 1868 v[i] = global_node_page_state !! 1701 v[i] = global_node_page_state(i); 1869 if (vmstat_item_print_in_thp( << 1870 v[i] /= HPAGE_PMD_NR; << 1871 } << 1872 v += NR_VM_NODE_STAT_ITEMS; 1702 v += NR_VM_NODE_STAT_ITEMS; 1873 1703 1874 global_dirty_limits(v + NR_DIRTY_BG_T 1704 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, 1875 v + NR_DIRTY_THRE 1705 v + NR_DIRTY_THRESHOLD); 1876 v[NR_MEMMAP_PAGES] = atomic_long_read !! 1706 v += NR_VM_WRITEBACK_STAT_ITEMS; 1877 v[NR_MEMMAP_BOOT_PAGES] = atomic_long << 1878 v += NR_VM_STAT_ITEMS; << 1879 1707 1880 #ifdef CONFIG_VM_EVENT_COUNTERS 1708 #ifdef CONFIG_VM_EVENT_COUNTERS 1881 all_vm_events(v); 1709 all_vm_events(v); 1882 v[PGPGIN] /= 2; /* sectors -> 1710 v[PGPGIN] /= 2; /* sectors -> kbytes */ 1883 v[PGPGOUT] /= 2; 1711 v[PGPGOUT] /= 2; 1884 #endif 1712 #endif 1885 return (unsigned long *)m->private + 1713 return (unsigned long *)m->private + *pos; 1886 } 1714 } 1887 1715 1888 static void *vmstat_next(struct seq_file *m, 1716 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 1889 { 1717 { 1890 (*pos)++; 1718 (*pos)++; 1891 if (*pos >= NR_VMSTAT_ITEMS) 1719 if (*pos >= NR_VMSTAT_ITEMS) 1892 return NULL; 1720 return NULL; 1893 return (unsigned long *)m->private + 1721 return (unsigned long *)m->private + *pos; 1894 } 1722 } 1895 1723 1896 static int vmstat_show(struct seq_file *m, vo 1724 static int vmstat_show(struct seq_file *m, void *arg) 1897 { 1725 { 1898 unsigned long *l = arg; 1726 unsigned long *l = arg; 1899 unsigned long off = l - (unsigned lon 1727 unsigned long off = l - (unsigned long *)m->private; 1900 1728 1901 seq_puts(m, vmstat_text[off]); 1729 seq_puts(m, vmstat_text[off]); 1902 seq_put_decimal_ull(m, " ", *l); 1730 seq_put_decimal_ull(m, " ", *l); 1903 seq_putc(m, '\n'); 1731 seq_putc(m, '\n'); 1904 1732 1905 if (off == NR_VMSTAT_ITEMS - 1) { 1733 if (off == NR_VMSTAT_ITEMS - 1) { 1906 /* 1734 /* 1907 * We've come to the end - ad 1735 * We've come to the end - add any deprecated counters to avoid 1908 * breaking userspace which m 1736 * breaking userspace which might depend on them being present. 
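		 * (At present the only such counter is nr_unstable, emitted
		 * below as a constant 0 now that nothing accounts unstable
		 * NFS pages any more.)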
1909 */ 1737 */ 1910 seq_puts(m, "nr_unstable 0\n" 1738 seq_puts(m, "nr_unstable 0\n"); 1911 } 1739 } 1912 return 0; 1740 return 0; 1913 } 1741 } 1914 1742 1915 static void vmstat_stop(struct seq_file *m, v 1743 static void vmstat_stop(struct seq_file *m, void *arg) 1916 { 1744 { 1917 kfree(m->private); 1745 kfree(m->private); 1918 m->private = NULL; 1746 m->private = NULL; 1919 } 1747 } 1920 1748 1921 static const struct seq_operations vmstat_op 1749 static const struct seq_operations vmstat_op = { 1922 .start = vmstat_start, 1750 .start = vmstat_start, 1923 .next = vmstat_next, 1751 .next = vmstat_next, 1924 .stop = vmstat_stop, 1752 .stop = vmstat_stop, 1925 .show = vmstat_show, 1753 .show = vmstat_show, 1926 }; 1754 }; 1927 #endif /* CONFIG_PROC_FS */ 1755 #endif /* CONFIG_PROC_FS */ 1928 1756 1929 #ifdef CONFIG_SMP 1757 #ifdef CONFIG_SMP 1930 static DEFINE_PER_CPU(struct delayed_work, vm 1758 static DEFINE_PER_CPU(struct delayed_work, vmstat_work); 1931 int sysctl_stat_interval __read_mostly = HZ; 1759 int sysctl_stat_interval __read_mostly = HZ; 1932 1760 1933 #ifdef CONFIG_PROC_FS 1761 #ifdef CONFIG_PROC_FS 1934 static void refresh_vm_stats(struct work_stru 1762 static void refresh_vm_stats(struct work_struct *work) 1935 { 1763 { 1936 refresh_cpu_vm_stats(true); 1764 refresh_cpu_vm_stats(true); 1937 } 1765 } 1938 1766 1939 int vmstat_refresh(const struct ctl_table *ta !! 1767 int vmstat_refresh(struct ctl_table *table, int write, 1940 void *buffer, size_t *lenp 1768 void *buffer, size_t *lenp, loff_t *ppos) 1941 { 1769 { 1942 long val; 1770 long val; 1943 int err; 1771 int err; 1944 int i; 1772 int i; 1945 1773 1946 /* 1774 /* 1947 * The regular update, every sysctl_s 1775 * The regular update, every sysctl_stat_interval, may come later 1948 * than expected: leaving a significa 1776 * than expected: leaving a significant amount in per_cpu buckets. 1949 * This is particularly misleading wh 1777 * This is particularly misleading when checking a quantity of HUGE 1950 * pages, immediately after running a 1778 * pages, immediately after running a test. /proc/sys/vm/stat_refresh, 1951 * which can equally be echo'ed to or 1779 * which can equally be echo'ed to or cat'ted from (by root), 1952 * can be used to update the stats ju 1780 * can be used to update the stats just before reading them. 1953 * 1781 * 1954 * Oh, and since global_zone_page_sta 1782 * Oh, and since global_zone_page_state() etc. are so careful to hide 1955 * transiently negative values, repor 1783 * transiently negative values, report an error here if any of 1956 * the stats is negative, so we know 1784 * the stats is negative, so we know to go looking for imbalance. 1957 */ 1785 */ 1958 err = schedule_on_each_cpu(refresh_vm 1786 err = schedule_on_each_cpu(refresh_vm_stats); 1959 if (err) 1787 if (err) 1960 return err; 1788 return err; 1961 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS 1789 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { 1962 /* << 1963 * Skip checking stats known << 1964 */ << 1965 switch (i) { << 1966 case NR_ZONE_WRITE_PENDING: << 1967 case NR_FREE_CMA_PAGES: << 1968 continue; << 1969 } << 1970 val = atomic_long_read(&vm_zo 1790 val = atomic_long_read(&vm_zone_stat[i]); 1971 if (val < 0) { 1791 if (val < 0) { 1972 pr_warn("%s: %s %ld\n 1792 pr_warn("%s: %s %ld\n", 1973 __func__, zon 1793 __func__, zone_stat_name(i), val); >> 1794 err = -EINVAL; 1974 } 1795 } 1975 } 1796 } 1976 for (i = 0; i < NR_VM_NODE_STAT_ITEMS !! 1797 #ifdef CONFIG_NUMA 1977 /* !! 
1798 for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { 1978 * Skip checking stats known !! 1799 val = atomic_long_read(&vm_numa_stat[i]); 1979 */ << 1980 switch (i) { << 1981 case NR_WRITEBACK: << 1982 continue; << 1983 } << 1984 val = atomic_long_read(&vm_no << 1985 if (val < 0) { 1800 if (val < 0) { 1986 pr_warn("%s: %s %ld\n 1801 pr_warn("%s: %s %ld\n", 1987 __func__, nod !! 1802 __func__, numa_stat_name(i), val); >> 1803 err = -EINVAL; 1988 } 1804 } 1989 } 1805 } >> 1806 #endif >> 1807 if (err) >> 1808 return err; 1990 if (write) 1809 if (write) 1991 *ppos += *lenp; 1810 *ppos += *lenp; 1992 else 1811 else 1993 *lenp = 0; 1812 *lenp = 0; 1994 return 0; 1813 return 0; 1995 } 1814 } 1996 #endif /* CONFIG_PROC_FS */ 1815 #endif /* CONFIG_PROC_FS */ 1997 1816 1998 static void vmstat_update(struct work_struct 1817 static void vmstat_update(struct work_struct *w) 1999 { 1818 { 2000 if (refresh_cpu_vm_stats(true)) { 1819 if (refresh_cpu_vm_stats(true)) { 2001 /* 1820 /* 2002 * Counters were updated so w 1821 * Counters were updated so we expect more updates 2003 * to occur in the future. Ke 1822 * to occur in the future. Keep on running the 2004 * update worker thread. 1823 * update worker thread. 2005 */ 1824 */ 2006 queue_delayed_work_on(smp_pro 1825 queue_delayed_work_on(smp_processor_id(), mm_percpu_wq, 2007 this_cpu_ptr( 1826 this_cpu_ptr(&vmstat_work), 2008 round_jiffies 1827 round_jiffies_relative(sysctl_stat_interval)); 2009 } 1828 } 2010 } 1829 } 2011 1830 2012 /* 1831 /* >> 1832 * Switch off vmstat processing and then fold all the remaining differentials >> 1833 * until the diffs stay at zero. The function is used by NOHZ and can only be >> 1834 * invoked when tick processing is not active. >> 1835 */ >> 1836 /* 2013 * Check if the diffs for a certain cpu indic 1837 * Check if the diffs for a certain cpu indicate that 2014 * an update is needed. 1838 * an update is needed. 2015 */ 1839 */ 2016 static bool need_update(int cpu) 1840 static bool need_update(int cpu) 2017 { 1841 { 2018 pg_data_t *last_pgdat = NULL; << 2019 struct zone *zone; 1842 struct zone *zone; 2020 1843 2021 for_each_populated_zone(zone) { 1844 for_each_populated_zone(zone) { 2022 struct per_cpu_zonestat *pzst !! 1845 struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); 2023 struct per_cpu_nodestat *n; !! 1846 >> 1847 BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); >> 1848 #ifdef CONFIG_NUMA >> 1849 BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); >> 1850 #endif 2024 1851 2025 /* 1852 /* 2026 * The fast way of checking i 1853 * The fast way of checking if there are any vmstat diffs. 2027 */ 1854 */ 2028 if (memchr_inv(pzstats->vm_st !! 1855 if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * >> 1856 sizeof(p->vm_stat_diff[0]))) 2029 return true; 1857 return true; 2030 !! 1858 #ifdef CONFIG_NUMA 2031 if (last_pgdat == zone->zone_ !! 1859 if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS * 2032 continue; !! 1860 sizeof(p->vm_numa_stat_diff[0]))) 2033 last_pgdat = zone->zone_pgdat << 2034 n = per_cpu_ptr(zone->zone_pg << 2035 if (memchr_inv(n->vm_node_sta << 2036 return true; 1861 return true; >> 1862 #endif 2037 } 1863 } 2038 return false; 1864 return false; 2039 } 1865 } 2040 1866 2041 /* 1867 /* 2042 * Switch off vmstat processing and then fold 1868 * Switch off vmstat processing and then fold all the remaining differentials 2043 * until the diffs stay at zero. The function 1869 * until the diffs stay at zero. 
The function is used by NOHZ and can only be 2044 * invoked when tick processing is not active 1870 * invoked when tick processing is not active. 2045 */ 1871 */ 2046 void quiet_vmstat(void) 1872 void quiet_vmstat(void) 2047 { 1873 { 2048 if (system_state != SYSTEM_RUNNING) 1874 if (system_state != SYSTEM_RUNNING) 2049 return; 1875 return; 2050 1876 2051 if (!delayed_work_pending(this_cpu_pt 1877 if (!delayed_work_pending(this_cpu_ptr(&vmstat_work))) 2052 return; 1878 return; 2053 1879 2054 if (!need_update(smp_processor_id())) 1880 if (!need_update(smp_processor_id())) 2055 return; 1881 return; 2056 1882 2057 /* 1883 /* 2058 * Just refresh counters and do not c 1884 * Just refresh counters and do not care about the pending delayed 2059 * vmstat_update. It doesn't fire tha 1885 * vmstat_update. It doesn't fire that often to matter and canceling 2060 * it would be too expensive from thi 1886 * it would be too expensive from this path. 2061 * vmstat_shepherd will take care abo 1887 * vmstat_shepherd will take care about that for us. 2062 */ 1888 */ 2063 refresh_cpu_vm_stats(false); 1889 refresh_cpu_vm_stats(false); 2064 } 1890 } 2065 1891 2066 /* 1892 /* 2067 * Shepherd worker thread that checks the 1893 * Shepherd worker thread that checks the 2068 * differentials of processors that have thei 1894 * differentials of processors that have their worker 2069 * threads for vm statistics updates disabled 1895 * threads for vm statistics updates disabled because of 2070 * inactivity. 1896 * inactivity. 2071 */ 1897 */ 2072 static void vmstat_shepherd(struct work_struc 1898 static void vmstat_shepherd(struct work_struct *w); 2073 1899 2074 static DECLARE_DEFERRABLE_WORK(shepherd, vmst 1900 static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd); 2075 1901 2076 static void vmstat_shepherd(struct work_struc 1902 static void vmstat_shepherd(struct work_struct *w) 2077 { 1903 { 2078 int cpu; 1904 int cpu; 2079 1905 2080 cpus_read_lock(); !! 1906 get_online_cpus(); 2081 /* Check processors whose vmstat work 1907 /* Check processors whose vmstat worker threads have been disabled */ 2082 for_each_online_cpu(cpu) { 1908 for_each_online_cpu(cpu) { 2083 struct delayed_work *dw = &pe 1909 struct delayed_work *dw = &per_cpu(vmstat_work, cpu); 2084 1910 2085 /* << 2086 * In kernel users of vmstat << 2087 * they are using zone_page_s << 2088 * an imprecision as the regu << 2089 * cumulative error can grow << 2090 * << 2091 * From that POV the regular << 2092 * been isolated from the ker << 2093 * infrastructure ever notici << 2094 * for all isolated CPUs to a << 2095 */ << 2096 if (cpu_is_isolated(cpu)) << 2097 continue; << 2098 << 2099 if (!delayed_work_pending(dw) 1911 if (!delayed_work_pending(dw) && need_update(cpu)) 2100 queue_delayed_work_on 1912 queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); 2101 << 2102 cond_resched(); << 2103 } 1913 } 2104 cpus_read_unlock(); !! 
1914 put_online_cpus(); 2105 1915 2106 schedule_delayed_work(&shepherd, 1916 schedule_delayed_work(&shepherd, 2107 round_jiffies_relative(sysctl 1917 round_jiffies_relative(sysctl_stat_interval)); 2108 } 1918 } 2109 1919 2110 static void __init start_shepherd_timer(void) 1920 static void __init start_shepherd_timer(void) 2111 { 1921 { 2112 int cpu; 1922 int cpu; 2113 1923 2114 for_each_possible_cpu(cpu) 1924 for_each_possible_cpu(cpu) 2115 INIT_DEFERRABLE_WORK(per_cpu_ 1925 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), 2116 vmstat_update); 1926 vmstat_update); 2117 1927 2118 schedule_delayed_work(&shepherd, 1928 schedule_delayed_work(&shepherd, 2119 round_jiffies_relative(sysctl 1929 round_jiffies_relative(sysctl_stat_interval)); 2120 } 1930 } 2121 1931 2122 static void __init init_cpu_node_state(void) 1932 static void __init init_cpu_node_state(void) 2123 { 1933 { 2124 int node; 1934 int node; 2125 1935 2126 for_each_online_node(node) { 1936 for_each_online_node(node) { 2127 if (!cpumask_empty(cpumask_of !! 1937 if (cpumask_weight(cpumask_of_node(node)) > 0) 2128 node_set_state(node, 1938 node_set_state(node, N_CPU); 2129 } 1939 } 2130 } 1940 } 2131 1941 2132 static int vmstat_cpu_online(unsigned int cpu 1942 static int vmstat_cpu_online(unsigned int cpu) 2133 { 1943 { 2134 refresh_zone_stat_thresholds(); 1944 refresh_zone_stat_thresholds(); 2135 !! 1945 node_set_state(cpu_to_node(cpu), N_CPU); 2136 if (!node_state(cpu_to_node(cpu), N_C << 2137 node_set_state(cpu_to_node(cp << 2138 } << 2139 << 2140 return 0; 1946 return 0; 2141 } 1947 } 2142 1948 2143 static int vmstat_cpu_down_prep(unsigned int 1949 static int vmstat_cpu_down_prep(unsigned int cpu) 2144 { 1950 { 2145 cancel_delayed_work_sync(&per_cpu(vms 1951 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); 2146 return 0; 1952 return 0; 2147 } 1953 } 2148 1954 2149 static int vmstat_cpu_dead(unsigned int cpu) 1955 static int vmstat_cpu_dead(unsigned int cpu) 2150 { 1956 { 2151 const struct cpumask *node_cpus; 1957 const struct cpumask *node_cpus; 2152 int node; 1958 int node; 2153 1959 2154 node = cpu_to_node(cpu); 1960 node = cpu_to_node(cpu); 2155 1961 2156 refresh_zone_stat_thresholds(); 1962 refresh_zone_stat_thresholds(); 2157 node_cpus = cpumask_of_node(node); 1963 node_cpus = cpumask_of_node(node); 2158 if (!cpumask_empty(node_cpus)) !! 
1964 if (cpumask_weight(node_cpus) > 0) 2159 return 0; 1965 return 0; 2160 1966 2161 node_clear_state(node, N_CPU); 1967 node_clear_state(node, N_CPU); 2162 << 2163 return 0; 1968 return 0; 2164 } 1969 } 2165 1970 2166 #endif 1971 #endif 2167 1972 2168 struct workqueue_struct *mm_percpu_wq; 1973 struct workqueue_struct *mm_percpu_wq; 2169 1974 2170 void __init init_mm_internals(void) 1975 void __init init_mm_internals(void) 2171 { 1976 { 2172 int ret __maybe_unused; 1977 int ret __maybe_unused; 2173 1978 2174 mm_percpu_wq = alloc_workqueue("mm_pe 1979 mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0); 2175 1980 2176 #ifdef CONFIG_SMP 1981 #ifdef CONFIG_SMP 2177 ret = cpuhp_setup_state_nocalls(CPUHP 1982 ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", 2178 NULL, 1983 NULL, vmstat_cpu_dead); 2179 if (ret < 0) 1984 if (ret < 0) 2180 pr_err("vmstat: failed to reg 1985 pr_err("vmstat: failed to register 'dead' hotplug state\n"); 2181 1986 2182 ret = cpuhp_setup_state_nocalls(CPUHP 1987 ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online", 2183 vmsta 1988 vmstat_cpu_online, 2184 vmsta 1989 vmstat_cpu_down_prep); 2185 if (ret < 0) 1990 if (ret < 0) 2186 pr_err("vmstat: failed to reg 1991 pr_err("vmstat: failed to register 'online' hotplug state\n"); 2187 1992 2188 cpus_read_lock(); !! 1993 get_online_cpus(); 2189 init_cpu_node_state(); 1994 init_cpu_node_state(); 2190 cpus_read_unlock(); !! 1995 put_online_cpus(); 2191 1996 2192 start_shepherd_timer(); 1997 start_shepherd_timer(); 2193 #endif 1998 #endif 2194 #ifdef CONFIG_PROC_FS 1999 #ifdef CONFIG_PROC_FS 2195 proc_create_seq("buddyinfo", 0444, NU 2000 proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); 2196 proc_create_seq("pagetypeinfo", 0400, 2001 proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); 2197 proc_create_seq("vmstat", 0444, NULL, 2002 proc_create_seq("vmstat", 0444, NULL, &vmstat_op); 2198 proc_create_seq("zoneinfo", 0444, NUL 2003 proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); 2199 #endif 2004 #endif 2200 } 2005 } 2201 2006 2202 #if defined(CONFIG_DEBUG_FS) && defined(CONFI 2007 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) 2203 2008 2204 /* 2009 /* 2205 * Return an index indicating how much of the 2010 * Return an index indicating how much of the available free memory is 2206 * unusable for an allocation of the requeste 2011 * unusable for an allocation of the requested size. 2207 */ 2012 */ 2208 static int unusable_free_index(unsigned int o 2013 static int unusable_free_index(unsigned int order, 2209 struct contig 2014 struct contig_page_info *info) 2210 { 2015 { 2211 /* No free memory is interpreted as a 2016 /* No free memory is interpreted as all free memory is unusable */ 2212 if (info->free_pages == 0) 2017 if (info->free_pages == 0) 2213 return 1000; 2018 return 1000; 2214 2019 2215 /* 2020 /* 2216 * Index should be a value between 0 2021 * Index should be a value between 0 and 1. Return a value to 3 2217 * decimal places. 2022 * decimal places. 
2218 * 2023 * 2219 * 0 => no fragmentation 2024 * 0 => no fragmentation 2220 * 1 => high fragmentation 2025 * 1 => high fragmentation 2221 */ 2026 */ 2222 return div_u64((info->free_pages - (i 2027 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages); 2223 2028 2224 } 2029 } 2225 2030 2226 static void unusable_show_print(struct seq_fi 2031 static void unusable_show_print(struct seq_file *m, 2227 pg_da 2032 pg_data_t *pgdat, struct zone *zone) 2228 { 2033 { 2229 unsigned int order; 2034 unsigned int order; 2230 int index; 2035 int index; 2231 struct contig_page_info info; 2036 struct contig_page_info info; 2232 2037 2233 seq_printf(m, "Node %d, zone %8s ", 2038 seq_printf(m, "Node %d, zone %8s ", 2234 pgdat->node_i 2039 pgdat->node_id, 2235 zone->name); 2040 zone->name); 2236 for (order = 0; order < NR_PAGE_ORDER !! 2041 for (order = 0; order < MAX_ORDER; ++order) { 2237 fill_contig_page_info(zone, o 2042 fill_contig_page_info(zone, order, &info); 2238 index = unusable_free_index(o 2043 index = unusable_free_index(order, &info); 2239 seq_printf(m, "%d.%03d ", ind 2044 seq_printf(m, "%d.%03d ", index / 1000, index % 1000); 2240 } 2045 } 2241 2046 2242 seq_putc(m, '\n'); 2047 seq_putc(m, '\n'); 2243 } 2048 } 2244 2049 2245 /* 2050 /* 2246 * Display unusable free space index 2051 * Display unusable free space index 2247 * 2052 * 2248 * The unusable free space index measures how 2053 * The unusable free space index measures how much of the available free 2249 * memory cannot be used to satisfy an alloca 2054 * memory cannot be used to satisfy an allocation of a given size and is a 2250 * value between 0 and 1. The higher the valu 2055 * value between 0 and 1. The higher the value, the more of free memory is 2251 * unusable and by implication, the worse the 2056 * unusable and by implication, the worse the external fragmentation is. This 2252 * can be expressed as a percentage by multip 2057 * can be expressed as a percentage by multiplying by 100. 
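 * As a purely illustrative example (numbers invented for this comment, not
 * taken from any real system): if a zone has 1024 free pages and only 256 of
 * them sit in free blocks of at least the requested order, then
 * unusable_free_index() returns (1024 - 256) * 1000 / 1024 = 750, which is
 * displayed as 0.750, i.e. about 75% of the free memory cannot back that
 * allocation.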
2253 */ 2058 */ 2254 static int unusable_show(struct seq_file *m, 2059 static int unusable_show(struct seq_file *m, void *arg) 2255 { 2060 { 2256 pg_data_t *pgdat = (pg_data_t *)arg; 2061 pg_data_t *pgdat = (pg_data_t *)arg; 2257 2062 2258 /* check memoryless node */ 2063 /* check memoryless node */ 2259 if (!node_state(pgdat->node_id, N_MEM 2064 if (!node_state(pgdat->node_id, N_MEMORY)) 2260 return 0; 2065 return 0; 2261 2066 2262 walk_zones_in_node(m, pgdat, true, fa 2067 walk_zones_in_node(m, pgdat, true, false, unusable_show_print); 2263 2068 2264 return 0; 2069 return 0; 2265 } 2070 } 2266 2071 2267 static const struct seq_operations unusable_s 2072 static const struct seq_operations unusable_sops = { 2268 .start = frag_start, 2073 .start = frag_start, 2269 .next = frag_next, 2074 .next = frag_next, 2270 .stop = frag_stop, 2075 .stop = frag_stop, 2271 .show = unusable_show, 2076 .show = unusable_show, 2272 }; 2077 }; 2273 2078 2274 DEFINE_SEQ_ATTRIBUTE(unusable); 2079 DEFINE_SEQ_ATTRIBUTE(unusable); 2275 2080 2276 static void extfrag_show_print(struct seq_fil 2081 static void extfrag_show_print(struct seq_file *m, 2277 pg_da 2082 pg_data_t *pgdat, struct zone *zone) 2278 { 2083 { 2279 unsigned int order; 2084 unsigned int order; 2280 int index; 2085 int index; 2281 2086 2282 /* Alloc on stack as interrupts are d 2087 /* Alloc on stack as interrupts are disabled for zone walk */ 2283 struct contig_page_info info; 2088 struct contig_page_info info; 2284 2089 2285 seq_printf(m, "Node %d, zone %8s ", 2090 seq_printf(m, "Node %d, zone %8s ", 2286 pgdat->node_i 2091 pgdat->node_id, 2287 zone->name); 2092 zone->name); 2288 for (order = 0; order < NR_PAGE_ORDER !! 2093 for (order = 0; order < MAX_ORDER; ++order) { 2289 fill_contig_page_info(zone, o 2094 fill_contig_page_info(zone, order, &info); 2290 index = __fragmentation_index 2095 index = __fragmentation_index(order, &info); 2291 seq_printf(m, "%2d.%03d ", in !! 
2096 seq_printf(m, "%d.%03d ", index / 1000, index % 1000); 2292 } 2097 } 2293 2098 2294 seq_putc(m, '\n'); 2099 seq_putc(m, '\n'); 2295 } 2100 } 2296 2101 2297 /* 2102 /* 2298 * Display fragmentation index for orders tha 2103 * Display fragmentation index for orders that allocations would fail for 2299 */ 2104 */ 2300 static int extfrag_show(struct seq_file *m, v 2105 static int extfrag_show(struct seq_file *m, void *arg) 2301 { 2106 { 2302 pg_data_t *pgdat = (pg_data_t *)arg; 2107 pg_data_t *pgdat = (pg_data_t *)arg; 2303 2108 2304 walk_zones_in_node(m, pgdat, true, fa 2109 walk_zones_in_node(m, pgdat, true, false, extfrag_show_print); 2305 2110 2306 return 0; 2111 return 0; 2307 } 2112 } 2308 2113 2309 static const struct seq_operations extfrag_so 2114 static const struct seq_operations extfrag_sops = { 2310 .start = frag_start, 2115 .start = frag_start, 2311 .next = frag_next, 2116 .next = frag_next, 2312 .stop = frag_stop, 2117 .stop = frag_stop, 2313 .show = extfrag_show, 2118 .show = extfrag_show, 2314 }; 2119 }; 2315 2120 2316 DEFINE_SEQ_ATTRIBUTE(extfrag); 2121 DEFINE_SEQ_ATTRIBUTE(extfrag); 2317 2122 2318 static int __init extfrag_debug_init(void) 2123 static int __init extfrag_debug_init(void) 2319 { 2124 { 2320 struct dentry *extfrag_debug_root; 2125 struct dentry *extfrag_debug_root; 2321 2126 2322 extfrag_debug_root = debugfs_create_d 2127 extfrag_debug_root = debugfs_create_dir("extfrag", NULL); 2323 2128 2324 debugfs_create_file("unusable_index", 2129 debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, 2325 &unusable_fops); 2130 &unusable_fops); 2326 2131 2327 debugfs_create_file("extfrag_index", 2132 debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, 2328 &extfrag_fops); 2133 &extfrag_fops); 2329 2134 2330 return 0; 2135 return 0; 2331 } 2136 } 2332 2137 2333 module_init(extfrag_debug_init); 2138 module_init(extfrag_debug_init); 2334 << 2335 #endif 2139 #endif 2336 2140
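The formulas above can also be exercised from userspace: frag_show() publishes the same per-order free counts in /proc/buddyinfo that fill_contig_page_info() folds into the fragmentation and unusable-free-space indexes. The sketch below is purely illustrative and is not part of vmstat.c; it re-derives both indexes for one example order, and the /proc/buddyinfo parsing, the MAX_ORDERS bound and the TARGET_ORDER constant are assumptions made only for this sketch.

#include <stdio.h>
#include <stdlib.h>

#define MAX_ORDERS	16	/* assumption: enough columns for any buddyinfo row */
#define TARGET_ORDER	4	/* example request size: 2^4 contiguous pages */

/*
 * Same convention as __fragmentation_index() above: -1000 means a request
 * of this order would currently succeed, otherwise 0..1000 where 0 points
 * at lack of memory and 1000 at external fragmentation.
 */
static long frag_index(unsigned int order, unsigned long free_pages,
		       unsigned long free_blocks_total,
		       unsigned long free_blocks_suitable)
{
	unsigned long requested = 1UL << order;

	if (!free_blocks_total)
		return 0;
	if (free_blocks_suitable)
		return -1000;
	return 1000 - (1000 + free_pages * 1000 / requested) / free_blocks_total;
}

int main(void)
{
	char line[1024];
	FILE *fp = fopen("/proc/buddyinfo", "r");

	if (!fp) {
		perror("/proc/buddyinfo");
		return 1;
	}

	/* Each row looks like: "Node 0, zone   Normal     12     34 ..." */
	while (fgets(line, sizeof(line), fp)) {
		unsigned long nr_free[MAX_ORDERS] = { 0 };
		unsigned long free_pages = 0, blocks_total = 0, suitable = 0;
		long fragindex, unusable;
		char zone[32], *p, *end;
		int node, pos = 0, nr_orders = 0, order;

		if (sscanf(line, "Node %d, zone %31s %n", &node, zone, &pos) < 2)
			continue;

		for (p = line + pos; nr_orders < MAX_ORDERS; p = end) {
			unsigned long v = strtoul(p, &end, 10);

			if (end == p)
				break;
			nr_free[nr_orders++] = v;
		}

		/* Mirrors fill_contig_page_info() for TARGET_ORDER */
		for (order = 0; order < nr_orders; order++) {
			blocks_total += nr_free[order];
			free_pages += nr_free[order] << order;
			if (order >= TARGET_ORDER)
				suitable += nr_free[order] << (order - TARGET_ORDER);
		}

		fragindex = frag_index(TARGET_ORDER, free_pages, blocks_total,
				       suitable);
		/* Mirrors unusable_free_index() */
		unusable = free_pages ?
			(long)((free_pages - (suitable << TARGET_ORDER)) * 1000 /
			       free_pages) : 1000;

		printf("Node %d, zone %-8s order %d: fragindex %ld.%03ld unusable %ld.%03ld\n",
		       node, zone, TARGET_ORDER,
		       fragindex / 1000, labs(fragindex) % 1000,
		       unusable / 1000, unusable % 1000);
	}

	fclose(fp);
	return 0;
}

If the sketch were saved as, say, frag_sketch.c (a name chosen only for this example), an ordinary "cc -O2 frag_sketch.c" builds it; its output can then be set against the extfrag_index and unusable_index files created by extfrag_debug_init() under /sys/kernel/debug/extfrag/, with the caveat that the two readings sample the buddy lists at different moments and so will only roughly agree.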