1 // SPDX-License-Identifier: GPL-2.0-only << 2 /* 1 /* 3 * linux/mm/vmstat.c 2 * linux/mm/vmstat.c 4 * 3 * 5 * Manages VM statistics 4 * Manages VM statistics 6 * Copyright (C) 1991, 1992, 1993, 1994 Linu 5 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * 6 * 8 * zoned VM statistics 7 * zoned VM statistics 9 * Copyright (C) 2006 Silicon Graphics, Inc., 8 * Copyright (C) 2006 Silicon Graphics, Inc., 10 * Christoph Lameter <christoph@l 9 * Christoph Lameter <christoph@lameter.com> 11 * Copyright (C) 2008-2014 Christoph Lameter 10 * Copyright (C) 2008-2014 Christoph Lameter 12 */ 11 */ 13 #include <linux/fs.h> 12 #include <linux/fs.h> 14 #include <linux/mm.h> 13 #include <linux/mm.h> 15 #include <linux/err.h> 14 #include <linux/err.h> 16 #include <linux/module.h> 15 #include <linux/module.h> 17 #include <linux/slab.h> 16 #include <linux/slab.h> 18 #include <linux/cpu.h> 17 #include <linux/cpu.h> 19 #include <linux/cpumask.h> 18 #include <linux/cpumask.h> 20 #include <linux/vmstat.h> 19 #include <linux/vmstat.h> 21 #include <linux/proc_fs.h> 20 #include <linux/proc_fs.h> 22 #include <linux/seq_file.h> 21 #include <linux/seq_file.h> 23 #include <linux/debugfs.h> 22 #include <linux/debugfs.h> 24 #include <linux/sched.h> 23 #include <linux/sched.h> 25 #include <linux/math64.h> 24 #include <linux/math64.h> 26 #include <linux/writeback.h> 25 #include <linux/writeback.h> 27 #include <linux/compaction.h> 26 #include <linux/compaction.h> 28 #include <linux/mm_inline.h> 27 #include <linux/mm_inline.h> >> 28 #include <linux/page_ext.h> 29 #include <linux/page_owner.h> 29 #include <linux/page_owner.h> 30 #include <linux/sched/isolation.h> << 31 30 32 #include "internal.h" 31 #include "internal.h" 33 32 34 #ifdef CONFIG_NUMA << 35 int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; << 36 << 37 /* zero numa counters within a zone */ << 38 static void zero_zone_numa_counters(struct zon << 39 { << 40 int item, cpu; << 41 << 42 for (item = 0; item < NR_VM_NUMA_EVENT << 43 atomic_long_set(&zone->vm_numa << 44 for_each_online_cpu(cpu) { << 45 per_cpu_ptr(zone->per_ << 46 << 47 } << 48 } << 49 } << 50 << 51 /* zero numa counters of all the populated zon << 52 static void zero_zones_numa_counters(void) << 53 { << 54 struct zone *zone; << 55 << 56 for_each_populated_zone(zone) << 57 zero_zone_numa_counters(zone); << 58 } << 59 << 60 /* zero global numa counters */ << 61 static void zero_global_numa_counters(void) << 62 { << 63 int item; << 64 << 65 for (item = 0; item < NR_VM_NUMA_EVENT << 66 atomic_long_set(&vm_numa_event << 67 } << 68 << 69 static void invalid_numa_statistics(void) << 70 { << 71 zero_zones_numa_counters(); << 72 zero_global_numa_counters(); << 73 } << 74 << 75 static DEFINE_MUTEX(vm_numa_stat_lock); << 76 << 77 int sysctl_vm_numa_stat_handler(const struct c << 78 void *buffer, size_t *length, << 79 { << 80 int ret, oldval; << 81 << 82 mutex_lock(&vm_numa_stat_lock); << 83 if (write) << 84 oldval = sysctl_vm_numa_stat; << 85 ret = proc_dointvec_minmax(table, writ << 86 if (ret || !write) << 87 goto out; << 88 << 89 if (oldval == sysctl_vm_numa_stat) << 90 goto out; << 91 else if (sysctl_vm_numa_stat == ENABLE << 92 static_branch_enable(&vm_numa_ << 93 pr_info("enable numa statistic << 94 } else { << 95 static_branch_disable(&vm_numa << 96 invalid_numa_statistics(); << 97 pr_info("disable numa statisti << 98 } << 99 << 100 out: << 101 mutex_unlock(&vm_numa_stat_lock); << 102 return ret; << 103 } << 104 #endif << 105 << 106 #ifdef CONFIG_VM_EVENT_COUNTERS 33 #ifdef CONFIG_VM_EVENT_COUNTERS 107 
DEFINE_PER_CPU(struct vm_event_state, vm_event 34 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 108 EXPORT_PER_CPU_SYMBOL(vm_event_states); 35 EXPORT_PER_CPU_SYMBOL(vm_event_states); 109 36 110 static void sum_vm_events(unsigned long *ret) 37 static void sum_vm_events(unsigned long *ret) 111 { 38 { 112 int cpu; 39 int cpu; 113 int i; 40 int i; 114 41 115 memset(ret, 0, NR_VM_EVENT_ITEMS * siz 42 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 116 43 117 for_each_online_cpu(cpu) { 44 for_each_online_cpu(cpu) { 118 struct vm_event_state *this = 45 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 119 46 120 for (i = 0; i < NR_VM_EVENT_IT 47 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 121 ret[i] += this->event[ 48 ret[i] += this->event[i]; 122 } 49 } 123 } 50 } 124 51 125 /* 52 /* 126 * Accumulate the vm event counters across all 53 * Accumulate the vm event counters across all CPUs. 127 * The result is unavoidably approximate - it 54 * The result is unavoidably approximate - it can change 128 * during and after execution of this function 55 * during and after execution of this function. 129 */ 56 */ 130 void all_vm_events(unsigned long *ret) 57 void all_vm_events(unsigned long *ret) 131 { 58 { 132 cpus_read_lock(); !! 59 get_online_cpus(); 133 sum_vm_events(ret); 60 sum_vm_events(ret); 134 cpus_read_unlock(); !! 61 put_online_cpus(); 135 } 62 } 136 EXPORT_SYMBOL_GPL(all_vm_events); 63 EXPORT_SYMBOL_GPL(all_vm_events); 137 64 138 /* 65 /* 139 * Fold the foreign cpu events into our own. 66 * Fold the foreign cpu events into our own. 140 * 67 * 141 * This is adding to the events on one process 68 * This is adding to the events on one processor 142 * but keeps the global counts constant. 69 * but keeps the global counts constant. 143 */ 70 */ 144 void vm_events_fold_cpu(int cpu) 71 void vm_events_fold_cpu(int cpu) 145 { 72 { 146 struct vm_event_state *fold_state = &p 73 struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu); 147 int i; 74 int i; 148 75 149 for (i = 0; i < NR_VM_EVENT_ITEMS; i++ 76 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { 150 count_vm_events(i, fold_state- 77 count_vm_events(i, fold_state->event[i]); 151 fold_state->event[i] = 0; 78 fold_state->event[i] = 0; 152 } 79 } 153 } 80 } 154 81 155 #endif /* CONFIG_VM_EVENT_COUNTERS */ 82 #endif /* CONFIG_VM_EVENT_COUNTERS */ 156 83 157 /* 84 /* 158 * Manage combined zone based / global counter 85 * Manage combined zone based / global counters 159 * 86 * 160 * vm_stat contains the global counters 87 * vm_stat contains the global counters 161 */ 88 */ 162 atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITE !! 89 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; 163 atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITE !! 
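/*
 * Editor's note (not part of either kernel revision shown in this diff):
 * a minimal userspace sketch of the per-CPU event counter scheme used by
 * sum_vm_events()/all_vm_events() above.  NCPUS, NEVENTS and cpu_events[]
 * are invented stand-ins for the real per-CPU vm_event_states; the kernel
 * walks online CPUs under the CPU hotplug read lock, and the total is only
 * approximate because the counters keep changing while they are summed.
 */
#include <string.h>

#define NCPUS    4
#define NEVENTS  8

static unsigned long cpu_events[NCPUS][NEVENTS];   /* per-CPU event counters */

static void sum_events_sketch(unsigned long *ret)
{
	int cpu, i;

	memset(ret, 0, NEVENTS * sizeof(unsigned long));
	for (cpu = 0; cpu < NCPUS; cpu++)          /* "for_each_online_cpu" */
		for (i = 0; i < NEVENTS; i++)
			ret[i] += cpu_events[cpu][i];   /* totals drift while read */
}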
90 EXPORT_SYMBOL(vm_stat); 164 atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_I << 165 EXPORT_SYMBOL(vm_zone_stat); << 166 EXPORT_SYMBOL(vm_node_stat); << 167 << 168 #ifdef CONFIG_NUMA << 169 static void fold_vm_zone_numa_events(struct zo << 170 { << 171 unsigned long zone_numa_events[NR_VM_N << 172 int cpu; << 173 enum numa_stat_item item; << 174 << 175 for_each_online_cpu(cpu) { << 176 struct per_cpu_zonestat *pzsta << 177 << 178 pzstats = per_cpu_ptr(zone->pe << 179 for (item = 0; item < NR_VM_NU << 180 zone_numa_events[item] << 181 } << 182 << 183 for (item = 0; item < NR_VM_NUMA_EVENT << 184 zone_numa_event_add(zone_numa_ << 185 } << 186 << 187 void fold_vm_numa_events(void) << 188 { << 189 struct zone *zone; << 190 << 191 for_each_populated_zone(zone) << 192 fold_vm_zone_numa_events(zone) << 193 } << 194 #endif << 195 91 196 #ifdef CONFIG_SMP 92 #ifdef CONFIG_SMP 197 93 198 int calculate_pressure_threshold(struct zone * 94 int calculate_pressure_threshold(struct zone *zone) 199 { 95 { 200 int threshold; 96 int threshold; 201 int watermark_distance; 97 int watermark_distance; 202 98 203 /* 99 /* 204 * As vmstats are not up to date, ther 100 * As vmstats are not up to date, there is drift between the estimated 205 * and real values. For high threshold 101 * and real values. For high thresholds and a high number of CPUs, it 206 * is possible for the min watermark t 102 * is possible for the min watermark to be breached while the estimated 207 * value looks fine. The pressure thre 103 * value looks fine. The pressure threshold is a reduced value such 208 * that even the maximum amount of dri 104 * that even the maximum amount of drift will not accidentally breach 209 * the min watermark 105 * the min watermark 210 */ 106 */ 211 watermark_distance = low_wmark_pages(z 107 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); 212 threshold = max(1, (int)(watermark_dis 108 threshold = max(1, (int)(watermark_distance / num_online_cpus())); 213 109 214 /* 110 /* 215 * Maximum threshold is 125 111 * Maximum threshold is 125 216 */ 112 */ 217 threshold = min(125, threshold); 113 threshold = min(125, threshold); 218 114 219 return threshold; 115 return threshold; 220 } 116 } 221 117 222 int calculate_normal_threshold(struct zone *zo 118 int calculate_normal_threshold(struct zone *zone) 223 { 119 { 224 int threshold; 120 int threshold; 225 int mem; /* memory in 128 MB un 121 int mem; /* memory in 128 MB units */ 226 122 227 /* 123 /* 228 * The threshold scales with the numbe 124 * The threshold scales with the number of processors and the amount 229 * of memory per zone. More memory mea 125 * of memory per zone. More memory means that we can defer updates for 230 * longer, more processors could lead 126 * longer, more processors could lead to more contention. 231 * fls() is used to have a cheap way o 127 * fls() is used to have a cheap way of logarithmic scaling. 232 * 128 * 233 * Some sample thresholds: 129 * Some sample thresholds: 234 * 130 * 235 * Threshold Processors (fls) !! 
131 * Threshold Processors (fls) Zonesize fls(mem+1) 236 * ----------------------------------- 132 * ------------------------------------------------------------------ 237 * 8 1 1 133 * 8 1 1 0.9-1 GB 4 238 * 16 2 2 134 * 16 2 2 0.9-1 GB 4 239 * 20 2 2 135 * 20 2 2 1-2 GB 5 240 * 24 2 2 136 * 24 2 2 2-4 GB 6 241 * 28 2 2 137 * 28 2 2 4-8 GB 7 242 * 32 2 2 138 * 32 2 2 8-16 GB 8 243 * 4 2 2 139 * 4 2 2 <128M 1 244 * 30 4 3 140 * 30 4 3 2-4 GB 5 245 * 48 4 3 141 * 48 4 3 8-16 GB 8 246 * 32 8 4 142 * 32 8 4 1-2 GB 4 247 * 32 8 4 143 * 32 8 4 0.9-1GB 4 248 * 10 16 5 144 * 10 16 5 <128M 1 249 * 40 16 5 145 * 40 16 5 900M 4 250 * 70 64 7 146 * 70 64 7 2-4 GB 5 251 * 84 64 7 147 * 84 64 7 4-8 GB 6 252 * 108 512 9 148 * 108 512 9 4-8 GB 6 253 * 125 1024 10 149 * 125 1024 10 8-16 GB 8 254 * 125 1024 10 150 * 125 1024 10 16-32 GB 9 255 */ 151 */ 256 152 257 mem = zone_managed_pages(zone) >> (27 !! 153 mem = zone->managed_pages >> (27 - PAGE_SHIFT); 258 154 259 threshold = 2 * fls(num_online_cpus()) 155 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); 260 156 261 /* 157 /* 262 * Maximum threshold is 125 158 * Maximum threshold is 125 263 */ 159 */ 264 threshold = min(125, threshold); 160 threshold = min(125, threshold); 265 161 266 return threshold; 162 return threshold; 267 } 163 } 268 164 269 /* 165 /* 270 * Refresh the thresholds for each zone. 166 * Refresh the thresholds for each zone. 271 */ 167 */ 272 void refresh_zone_stat_thresholds(void) 168 void refresh_zone_stat_thresholds(void) 273 { 169 { 274 struct pglist_data *pgdat; << 275 struct zone *zone; 170 struct zone *zone; 276 int cpu; 171 int cpu; 277 int threshold; 172 int threshold; 278 173 279 /* Zero current pgdat thresholds */ << 280 for_each_online_pgdat(pgdat) { << 281 for_each_online_cpu(cpu) { << 282 per_cpu_ptr(pgdat->per << 283 } << 284 } << 285 << 286 for_each_populated_zone(zone) { 174 for_each_populated_zone(zone) { 287 struct pglist_data *pgdat = zo << 288 unsigned long max_drift, toler 175 unsigned long max_drift, tolerate_drift; 289 176 290 threshold = calculate_normal_t 177 threshold = calculate_normal_threshold(zone); 291 178 292 for_each_online_cpu(cpu) { !! 179 for_each_online_cpu(cpu) 293 int pgdat_threshold; !! 
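/*
 * Editor's sketch (illustrative only, not from either revision above):
 * a userspace model of calculate_normal_threshold().  fls_ul() is a
 * hypothetical stand-in for the kernel's fls(); shifting managed pages by
 * (27 - PAGE_SHIFT) expresses the zone size in 128MB units, and the result
 * is capped at 125 just like in the code above.
 */
static int fls_ul(unsigned long x)            /* highest set bit, 1-based; fls(0) == 0 */
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

static int normal_threshold_sketch(unsigned long managed_pages, int ncpus)
{
	int page_shift = 12;                             /* assumed 4K pages */
	int mem = managed_pages >> (27 - page_shift);    /* zone size in 128MB units */
	int threshold = 2 * fls_ul(ncpus) * (1 + fls_ul(mem));

	return threshold < 125 ? threshold : 125;        /* maximum threshold is 125 */
}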
180 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 294 << 295 per_cpu_ptr(zone->per_ << 296 181 = threshold; 297 182 298 /* Base nodestat thres << 299 pgdat_threshold = per_ << 300 per_cpu_ptr(pgdat->per << 301 = max(threshol << 302 } << 303 << 304 /* 183 /* 305 * Only set percpu_drift_mark 184 * Only set percpu_drift_mark if there is a danger that 306 * NR_FREE_PAGES reports the l 185 * NR_FREE_PAGES reports the low watermark is ok when in fact 307 * the min watermark could be 186 * the min watermark could be breached by an allocation 308 */ 187 */ 309 tolerate_drift = low_wmark_pag 188 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone); 310 max_drift = num_online_cpus() 189 max_drift = num_online_cpus() * threshold; 311 if (max_drift > tolerate_drift 190 if (max_drift > tolerate_drift) 312 zone->percpu_drift_mar 191 zone->percpu_drift_mark = high_wmark_pages(zone) + 313 max_dr 192 max_drift; 314 } 193 } 315 } 194 } 316 195 317 void set_pgdat_percpu_threshold(pg_data_t *pgd 196 void set_pgdat_percpu_threshold(pg_data_t *pgdat, 318 int (*calculat 197 int (*calculate_pressure)(struct zone *)) 319 { 198 { 320 struct zone *zone; 199 struct zone *zone; 321 int cpu; 200 int cpu; 322 int threshold; 201 int threshold; 323 int i; 202 int i; 324 203 325 for (i = 0; i < pgdat->nr_zones; i++) 204 for (i = 0; i < pgdat->nr_zones; i++) { 326 zone = &pgdat->node_zones[i]; 205 zone = &pgdat->node_zones[i]; 327 if (!zone->percpu_drift_mark) 206 if (!zone->percpu_drift_mark) 328 continue; 207 continue; 329 208 330 threshold = (*calculate_pressu 209 threshold = (*calculate_pressure)(zone); 331 for_each_online_cpu(cpu) 210 for_each_online_cpu(cpu) 332 per_cpu_ptr(zone->per_ !! 211 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 333 212 = threshold; 334 } 213 } 335 } 214 } 336 215 337 /* 216 /* 338 * For use when we know that interrupts are di 217 * For use when we know that interrupts are disabled, 339 * or when we know that preemption is disabled 218 * or when we know that preemption is disabled and that 340 * particular counter cannot be updated from i 219 * particular counter cannot be updated from interrupt context. 341 */ 220 */ 342 void __mod_zone_page_state(struct zone *zone, 221 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 343 long delta) 222 long delta) 344 { 223 { 345 struct per_cpu_zonestat __percpu *pcp !! 224 struct per_cpu_pageset __percpu *pcp = zone->pageset; 346 s8 __percpu *p = pcp->vm_stat_diff + i 225 s8 __percpu *p = pcp->vm_stat_diff + item; 347 long x; 226 long x; 348 long t; 227 long t; 349 228 350 /* << 351 * Accurate vmstat updates require a R << 352 * atomicity is provided by IRQs being << 353 * or via local_lock_irq. On PREEMPT_R << 354 * CPU migrations and preemption poten << 355 * disable preemption. << 356 */ << 357 preempt_disable_nested(); << 358 << 359 x = delta + __this_cpu_read(*p); 229 x = delta + __this_cpu_read(*p); 360 230 361 t = __this_cpu_read(pcp->stat_threshol 231 t = __this_cpu_read(pcp->stat_threshold); 362 232 363 if (unlikely(abs(x) > t)) { !! 
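/*
 * Editor's sketch (not from either kernel version shown): the core idea of
 * __mod_zone_page_state() is to batch small deltas in a tiny signed per-CPU
 * counter and fold them into the shared counter only once the per-CPU
 * threshold is exceeded.  "global_counter", "cpu_diff" and "cpu_threshold"
 * are invented stand-ins for vm_stat, vm_stat_diff and stat_threshold; the
 * real update paths additionally rely on IRQ/preemption protection.
 */
#include <stdlib.h>

static long        global_counter;            /* plays the role of zone->vm_stat[item] */
static signed char cpu_diff[4];               /* per-CPU vm_stat_diff */
static signed char cpu_threshold[4] = { 32, 32, 32, 32 };

static void mod_page_state_sketch(int cpu, long delta)
{
	long x = delta + cpu_diff[cpu];

	if (labs(x) > cpu_threshold[cpu]) {   /* drift too large: fold it */
		global_counter += x;          /* atomic_long_add() in the kernel */
		x = 0;
	}
	cpu_diff[cpu] = (signed char)x;
}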
233 if (unlikely(x > t || x < -t)) { 364 zone_page_state_add(x, zone, i 234 zone_page_state_add(x, zone, item); 365 x = 0; 235 x = 0; 366 } 236 } 367 __this_cpu_write(*p, x); 237 __this_cpu_write(*p, x); 368 << 369 preempt_enable_nested(); << 370 } 238 } 371 EXPORT_SYMBOL(__mod_zone_page_state); 239 EXPORT_SYMBOL(__mod_zone_page_state); 372 240 373 void __mod_node_page_state(struct pglist_data << 374 long delta) << 375 { << 376 struct per_cpu_nodestat __percpu *pcp << 377 s8 __percpu *p = pcp->vm_node_stat_dif << 378 long x; << 379 long t; << 380 << 381 if (vmstat_item_in_bytes(item)) { << 382 /* << 383 * Only cgroups use subpage ac << 384 * the global level, these ite << 385 * multiples of whole pages. S << 386 * internally to keep the per- << 387 */ << 388 VM_WARN_ON_ONCE(delta & (PAGE_ << 389 delta >>= PAGE_SHIFT; << 390 } << 391 << 392 /* See __mod_node_page_state */ << 393 preempt_disable_nested(); << 394 << 395 x = delta + __this_cpu_read(*p); << 396 << 397 t = __this_cpu_read(pcp->stat_threshol << 398 << 399 if (unlikely(abs(x) > t)) { << 400 node_page_state_add(x, pgdat, << 401 x = 0; << 402 } << 403 __this_cpu_write(*p, x); << 404 << 405 preempt_enable_nested(); << 406 } << 407 EXPORT_SYMBOL(__mod_node_page_state); << 408 << 409 /* 241 /* 410 * Optimized increment and decrement functions 242 * Optimized increment and decrement functions. 411 * 243 * 412 * These are only for a single page and theref 244 * These are only for a single page and therefore can take a struct page * 413 * argument instead of struct zone *. This all 245 * argument instead of struct zone *. This allows the inclusion of the code 414 * generated for page_zone(page) into the opti 246 * generated for page_zone(page) into the optimized functions. 415 * 247 * 416 * No overflow check is necessary and therefor 248 * No overflow check is necessary and therefore the differential can be 417 * incremented or decremented in place which m 249 * incremented or decremented in place which may allow the compilers to 418 * generate better code. 250 * generate better code. 419 * The increment or decrement is known and the 251 * The increment or decrement is known and therefore one boundary check can 420 * be omitted. 252 * be omitted. 421 * 253 * 422 * NOTE: These functions are very performance 254 * NOTE: These functions are very performance sensitive. Change only 423 * with care. 255 * with care. 424 * 256 * 425 * Some processors have inc/dec instructions t 257 * Some processors have inc/dec instructions that are atomic vs an interrupt. 426 * However, the code must first determine the 258 * However, the code must first determine the differential location in a zone 427 * based on the processor number and then inc/ 259 * based on the processor number and then inc/dec the counter. There is no 428 * guarantee without disabling preemption that 260 * guarantee without disabling preemption that the processor will not change 429 * in between and therefore the atomicity vs. 261 * in between and therefore the atomicity vs. interrupt cannot be exploited 430 * in a useful way here. 262 * in a useful way here. 431 */ 263 */ 432 void __inc_zone_state(struct zone *zone, enum 264 void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 433 { 265 { 434 struct per_cpu_zonestat __percpu *pcp !! 
266 struct per_cpu_pageset __percpu *pcp = zone->pageset; 435 s8 __percpu *p = pcp->vm_stat_diff + i 267 s8 __percpu *p = pcp->vm_stat_diff + item; 436 s8 v, t; 268 s8 v, t; 437 269 438 /* See __mod_node_page_state */ << 439 preempt_disable_nested(); << 440 << 441 v = __this_cpu_inc_return(*p); 270 v = __this_cpu_inc_return(*p); 442 t = __this_cpu_read(pcp->stat_threshol 271 t = __this_cpu_read(pcp->stat_threshold); 443 if (unlikely(v > t)) { 272 if (unlikely(v > t)) { 444 s8 overstep = t >> 1; 273 s8 overstep = t >> 1; 445 274 446 zone_page_state_add(v + overst 275 zone_page_state_add(v + overstep, zone, item); 447 __this_cpu_write(*p, -overstep 276 __this_cpu_write(*p, -overstep); 448 } 277 } 449 << 450 preempt_enable_nested(); << 451 } << 452 << 453 void __inc_node_state(struct pglist_data *pgda << 454 { << 455 struct per_cpu_nodestat __percpu *pcp << 456 s8 __percpu *p = pcp->vm_node_stat_dif << 457 s8 v, t; << 458 << 459 VM_WARN_ON_ONCE(vmstat_item_in_bytes(i << 460 << 461 /* See __mod_node_page_state */ << 462 preempt_disable_nested(); << 463 << 464 v = __this_cpu_inc_return(*p); << 465 t = __this_cpu_read(pcp->stat_threshol << 466 if (unlikely(v > t)) { << 467 s8 overstep = t >> 1; << 468 << 469 node_page_state_add(v + overst << 470 __this_cpu_write(*p, -overstep << 471 } << 472 << 473 preempt_enable_nested(); << 474 } 278 } 475 279 476 void __inc_zone_page_state(struct page *page, 280 void __inc_zone_page_state(struct page *page, enum zone_stat_item item) 477 { 281 { 478 __inc_zone_state(page_zone(page), item 282 __inc_zone_state(page_zone(page), item); 479 } 283 } 480 EXPORT_SYMBOL(__inc_zone_page_state); 284 EXPORT_SYMBOL(__inc_zone_page_state); 481 285 482 void __inc_node_page_state(struct page *page, << 483 { << 484 __inc_node_state(page_pgdat(page), ite << 485 } << 486 EXPORT_SYMBOL(__inc_node_page_state); << 487 << 488 void __dec_zone_state(struct zone *zone, enum 286 void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 489 { 287 { 490 struct per_cpu_zonestat __percpu *pcp !! 
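/*
 * Editor's sketch: the increment/decrement fast paths above "overstep" by
 * half the threshold when they spill, so a steady stream of increments does
 * not fold into the global counter on every update.  The names below
 * (global_counter2, diff2, threshold2) are purely illustrative.
 */
static long        global_counter2;
static signed char diff2;
static signed char threshold2 = 32;

static void inc_state_sketch(void)
{
	signed char v = ++diff2;

	if (v > threshold2) {
		signed char overstep = threshold2 >> 1;

		global_counter2 += v + overstep;   /* fold, plus half a threshold */
		diff2 = -overstep;                 /* restart well below the limit */
	}
}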
288 struct per_cpu_pageset __percpu *pcp = zone->pageset; 491 s8 __percpu *p = pcp->vm_stat_diff + i 289 s8 __percpu *p = pcp->vm_stat_diff + item; 492 s8 v, t; 290 s8 v, t; 493 291 494 /* See __mod_node_page_state */ << 495 preempt_disable_nested(); << 496 << 497 v = __this_cpu_dec_return(*p); 292 v = __this_cpu_dec_return(*p); 498 t = __this_cpu_read(pcp->stat_threshol 293 t = __this_cpu_read(pcp->stat_threshold); 499 if (unlikely(v < - t)) { 294 if (unlikely(v < - t)) { 500 s8 overstep = t >> 1; 295 s8 overstep = t >> 1; 501 296 502 zone_page_state_add(v - overst 297 zone_page_state_add(v - overstep, zone, item); 503 __this_cpu_write(*p, overstep) 298 __this_cpu_write(*p, overstep); 504 } 299 } 505 << 506 preempt_enable_nested(); << 507 } << 508 << 509 void __dec_node_state(struct pglist_data *pgda << 510 { << 511 struct per_cpu_nodestat __percpu *pcp << 512 s8 __percpu *p = pcp->vm_node_stat_dif << 513 s8 v, t; << 514 << 515 VM_WARN_ON_ONCE(vmstat_item_in_bytes(i << 516 << 517 /* See __mod_node_page_state */ << 518 preempt_disable_nested(); << 519 << 520 v = __this_cpu_dec_return(*p); << 521 t = __this_cpu_read(pcp->stat_threshol << 522 if (unlikely(v < - t)) { << 523 s8 overstep = t >> 1; << 524 << 525 node_page_state_add(v - overst << 526 __this_cpu_write(*p, overstep) << 527 } << 528 << 529 preempt_enable_nested(); << 530 } 300 } 531 301 532 void __dec_zone_page_state(struct page *page, 302 void __dec_zone_page_state(struct page *page, enum zone_stat_item item) 533 { 303 { 534 __dec_zone_state(page_zone(page), item 304 __dec_zone_state(page_zone(page), item); 535 } 305 } 536 EXPORT_SYMBOL(__dec_zone_page_state); 306 EXPORT_SYMBOL(__dec_zone_page_state); 537 307 538 void __dec_node_page_state(struct page *page, << 539 { << 540 __dec_node_state(page_pgdat(page), ite << 541 } << 542 EXPORT_SYMBOL(__dec_node_page_state); << 543 << 544 #ifdef CONFIG_HAVE_CMPXCHG_LOCAL 308 #ifdef CONFIG_HAVE_CMPXCHG_LOCAL 545 /* 309 /* 546 * If we have cmpxchg_local support then we do 310 * If we have cmpxchg_local support then we do not need to incur the overhead 547 * that comes with local_irq_save/restore if w 311 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. 548 * 312 * 549 * mod_state() modifies the zone counter state 313 * mod_state() modifies the zone counter state through atomic per cpu 550 * operations. 314 * operations. 551 * 315 * 552 * Overstep mode specifies how overstep should 316 * Overstep mode specifies how overstep should handled: 553 * 0 No overstepping 317 * 0 No overstepping 554 * 1 Overstepping half of threshold 318 * 1 Overstepping half of threshold 555 * -1 Overstepping minus half of thre 319 * -1 Overstepping minus half of threshold 556 */ 320 */ 557 static inline void mod_zone_state(struct zone !! 321 static inline void mod_state(struct zone *zone, enum zone_stat_item item, 558 enum zone_stat_item item, long delta, i !! 322 long delta, int overstep_mode) 559 { 323 { 560 struct per_cpu_zonestat __percpu *pcp !! 324 struct per_cpu_pageset __percpu *pcp = zone->pageset; 561 s8 __percpu *p = pcp->vm_stat_diff + i 325 s8 __percpu *p = pcp->vm_stat_diff + item; 562 long n, t, z; !! 326 long o, n, t, z; 563 s8 o; << 564 327 565 o = this_cpu_read(*p); << 566 do { 328 do { 567 z = 0; /* overflow to zone co 329 z = 0; /* overflow to zone counters */ 568 330 569 /* 331 /* 570 * The fetching of the stat_th 332 * The fetching of the stat_threshold is racy. 
We may apply 571 * a counter threshold to the 333 * a counter threshold to the wrong the cpu if we get 572 * rescheduled while executing 334 * rescheduled while executing here. However, the next 573 * counter update will apply t 335 * counter update will apply the threshold again and 574 * therefore bring the counter 336 * therefore bring the counter under the threshold again. 575 * 337 * 576 * Most of the time the thresh 338 * Most of the time the thresholds are the same anyways 577 * for all cpus in a zone. 339 * for all cpus in a zone. 578 */ 340 */ 579 t = this_cpu_read(pcp->stat_th 341 t = this_cpu_read(pcp->stat_threshold); 580 342 581 n = delta + (long)o; !! 343 o = this_cpu_read(*p); >> 344 n = delta + o; 582 345 583 if (abs(n) > t) { !! 346 if (n > t || n < -t) { 584 int os = overstep_mode 347 int os = overstep_mode * (t >> 1) ; 585 348 586 /* Overflow must be ad 349 /* Overflow must be added to zone counters */ 587 z = n + os; 350 z = n + os; 588 n = -os; 351 n = -os; 589 } 352 } 590 } while (!this_cpu_try_cmpxchg(*p, &o, !! 353 } while (this_cpu_cmpxchg(*p, o, n) != o); 591 354 592 if (z) 355 if (z) 593 zone_page_state_add(z, zone, i 356 zone_page_state_add(z, zone, item); 594 } 357 } 595 358 596 void mod_zone_page_state(struct zone *zone, en 359 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 597 long delta) 360 long delta) 598 { 361 { 599 mod_zone_state(zone, item, delta, 0); !! 362 mod_state(zone, item, delta, 0); 600 } 363 } 601 EXPORT_SYMBOL(mod_zone_page_state); 364 EXPORT_SYMBOL(mod_zone_page_state); 602 365 >> 366 void inc_zone_state(struct zone *zone, enum zone_stat_item item) >> 367 { >> 368 mod_state(zone, item, 1, 1); >> 369 } >> 370 603 void inc_zone_page_state(struct page *page, en 371 void inc_zone_page_state(struct page *page, enum zone_stat_item item) 604 { 372 { 605 mod_zone_state(page_zone(page), item, !! 373 mod_state(page_zone(page), item, 1, 1); 606 } 374 } 607 EXPORT_SYMBOL(inc_zone_page_state); 375 EXPORT_SYMBOL(inc_zone_page_state); 608 376 609 void dec_zone_page_state(struct page *page, en 377 void dec_zone_page_state(struct page *page, enum zone_stat_item item) 610 { 378 { 611 mod_zone_state(page_zone(page), item, !! 379 mod_state(page_zone(page), item, -1, -1); 612 } 380 } 613 EXPORT_SYMBOL(dec_zone_page_state); 381 EXPORT_SYMBOL(dec_zone_page_state); 614 << 615 static inline void mod_node_state(struct pglis << 616 enum node_stat_item item, int delta, in << 617 { << 618 struct per_cpu_nodestat __percpu *pcp << 619 s8 __percpu *p = pcp->vm_node_stat_dif << 620 long n, t, z; << 621 s8 o; << 622 << 623 if (vmstat_item_in_bytes(item)) { << 624 /* << 625 * Only cgroups use subpage ac << 626 * the global level, these ite << 627 * multiples of whole pages. S << 628 * internally to keep the per- << 629 */ << 630 VM_WARN_ON_ONCE(delta & (PAGE_ << 631 delta >>= PAGE_SHIFT; << 632 } << 633 << 634 o = this_cpu_read(*p); << 635 do { << 636 z = 0; /* overflow to node co << 637 << 638 /* << 639 * The fetching of the stat_th << 640 * a counter threshold to the << 641 * rescheduled while executing << 642 * counter update will apply t << 643 * therefore bring the counter << 644 * << 645 * Most of the time the thresh << 646 * for all cpus in a node. 
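/*
 * Editor's sketch: on architectures with a cheap local cmpxchg the kernel
 * avoids disabling interrupts and instead retries a compare-and-exchange on
 * the per-CPU diff, as in the mod_state()/mod_zone_state() loop above.
 * This userspace analogue uses C11 atomics; diff3/threshold3 are invented
 * names, and the overstep handling mirrors os = overstep_mode * (t >> 1).
 */
#include <stdlib.h>
#include <stdatomic.h>

static _Atomic long        global_counter3;
static _Atomic signed char diff3;
static signed char         threshold3 = 32;

static void mod_state_sketch(long delta, int overstep_mode)
{
	signed char o = atomic_load(&diff3);
	long n, z;

	do {
		long t = threshold3;   /* racy read, re-applied on the next update */
		long os;

		z = 0;                 /* amount to spill into the global counter */
		n = delta + (long)o;
		if (labs(n) > t) {
			os = overstep_mode * (t >> 1);
			z = n + os;
			n = -os;
		}
	} while (!atomic_compare_exchange_weak(&diff3, &o, (signed char)n));

	if (z)
		atomic_fetch_add(&global_counter3, z);
}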
<< 647 */ << 648 t = this_cpu_read(pcp->stat_th << 649 << 650 n = delta + (long)o; << 651 << 652 if (abs(n) > t) { << 653 int os = overstep_mode << 654 << 655 /* Overflow must be ad << 656 z = n + os; << 657 n = -os; << 658 } << 659 } while (!this_cpu_try_cmpxchg(*p, &o, << 660 << 661 if (z) << 662 node_page_state_add(z, pgdat, << 663 } << 664 << 665 void mod_node_page_state(struct pglist_data *p << 666 long d << 667 { << 668 mod_node_state(pgdat, item, delta, 0); << 669 } << 670 EXPORT_SYMBOL(mod_node_page_state); << 671 << 672 void inc_node_state(struct pglist_data *pgdat, << 673 { << 674 mod_node_state(pgdat, item, 1, 1); << 675 } << 676 << 677 void inc_node_page_state(struct page *page, en << 678 { << 679 mod_node_state(page_pgdat(page), item, << 680 } << 681 EXPORT_SYMBOL(inc_node_page_state); << 682 << 683 void dec_node_page_state(struct page *page, en << 684 { << 685 mod_node_state(page_pgdat(page), item, << 686 } << 687 EXPORT_SYMBOL(dec_node_page_state); << 688 #else 382 #else 689 /* 383 /* 690 * Use interrupt disable to serialize counter 384 * Use interrupt disable to serialize counter updates 691 */ 385 */ 692 void mod_zone_page_state(struct zone *zone, en 386 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 693 long delta) 387 long delta) 694 { 388 { 695 unsigned long flags; 389 unsigned long flags; 696 390 697 local_irq_save(flags); 391 local_irq_save(flags); 698 __mod_zone_page_state(zone, item, delt 392 __mod_zone_page_state(zone, item, delta); 699 local_irq_restore(flags); 393 local_irq_restore(flags); 700 } 394 } 701 EXPORT_SYMBOL(mod_zone_page_state); 395 EXPORT_SYMBOL(mod_zone_page_state); 702 396 703 void inc_zone_page_state(struct page *page, en !! 397 void inc_zone_state(struct zone *zone, enum zone_stat_item item) 704 { 398 { 705 unsigned long flags; 399 unsigned long flags; 706 struct zone *zone; << 707 400 708 zone = page_zone(page); << 709 local_irq_save(flags); 401 local_irq_save(flags); 710 __inc_zone_state(zone, item); 402 __inc_zone_state(zone, item); 711 local_irq_restore(flags); 403 local_irq_restore(flags); 712 } 404 } 713 EXPORT_SYMBOL(inc_zone_page_state); << 714 << 715 void dec_zone_page_state(struct page *page, en << 716 { << 717 unsigned long flags; << 718 << 719 local_irq_save(flags); << 720 __dec_zone_page_state(page, item); << 721 local_irq_restore(flags); << 722 } << 723 EXPORT_SYMBOL(dec_zone_page_state); << 724 << 725 void inc_node_state(struct pglist_data *pgdat, << 726 { << 727 unsigned long flags; << 728 << 729 local_irq_save(flags); << 730 __inc_node_state(pgdat, item); << 731 local_irq_restore(flags); << 732 } << 733 EXPORT_SYMBOL(inc_node_state); << 734 << 735 void mod_node_page_state(struct pglist_data *p << 736 long d << 737 { << 738 unsigned long flags; << 739 << 740 local_irq_save(flags); << 741 __mod_node_page_state(pgdat, item, del << 742 local_irq_restore(flags); << 743 } << 744 EXPORT_SYMBOL(mod_node_page_state); << 745 405 746 void inc_node_page_state(struct page *page, en !! 406 void inc_zone_page_state(struct page *page, enum zone_stat_item item) 747 { 407 { 748 unsigned long flags; 408 unsigned long flags; 749 struct pglist_data *pgdat; !! 409 struct zone *zone; 750 410 751 pgdat = page_pgdat(page); !! 411 zone = page_zone(page); 752 local_irq_save(flags); 412 local_irq_save(flags); 753 __inc_node_state(pgdat, item); !! 413 __inc_zone_state(zone, item); 754 local_irq_restore(flags); 414 local_irq_restore(flags); 755 } 415 } 756 EXPORT_SYMBOL(inc_node_page_state); !! 
416 EXPORT_SYMBOL(inc_zone_page_state); 757 417 758 void dec_node_page_state(struct page *page, en !! 418 void dec_zone_page_state(struct page *page, enum zone_stat_item item) 759 { 419 { 760 unsigned long flags; 420 unsigned long flags; 761 421 762 local_irq_save(flags); 422 local_irq_save(flags); 763 __dec_node_page_state(page, item); !! 423 __dec_zone_page_state(page, item); 764 local_irq_restore(flags); 424 local_irq_restore(flags); 765 } 425 } 766 EXPORT_SYMBOL(dec_node_page_state); !! 426 EXPORT_SYMBOL(dec_zone_page_state); 767 #endif 427 #endif 768 428 >> 429 769 /* 430 /* 770 * Fold a differential into the global counter 431 * Fold a differential into the global counters. 771 * Returns the number of counters updated. 432 * Returns the number of counters updated. 772 */ 433 */ 773 static int fold_diff(int *zone_diff, int *node !! 434 static int fold_diff(int *diff) 774 { 435 { 775 int i; 436 int i; 776 int changes = 0; 437 int changes = 0; 777 438 778 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; 439 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 779 if (zone_diff[i]) { !! 440 if (diff[i]) { 780 atomic_long_add(zone_d !! 441 atomic_long_add(diff[i], &vm_stat[i]); 781 changes++; << 782 } << 783 << 784 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; << 785 if (node_diff[i]) { << 786 atomic_long_add(node_d << 787 changes++; 442 changes++; 788 } 443 } 789 return changes; 444 return changes; 790 } 445 } 791 446 792 /* 447 /* 793 * Update the zone counters for the current cp 448 * Update the zone counters for the current cpu. 794 * 449 * 795 * Note that refresh_cpu_vm_stats strives to o 450 * Note that refresh_cpu_vm_stats strives to only access 796 * node local memory. The per cpu pagesets on 451 * node local memory. The per cpu pagesets on remote zones are placed 797 * in the memory local to the processor using 452 * in the memory local to the processor using that pageset. So the 798 * loop over all zones will access a series of 453 * loop over all zones will access a series of cachelines local to 799 * the processor. 454 * the processor. 800 * 455 * 801 * The call to zone_page_state_add updates the 456 * The call to zone_page_state_add updates the cachelines with the 802 * statistics in the remote zone struct as wel 457 * statistics in the remote zone struct as well as the global cachelines 803 * with the global counters. These could cause 458 * with the global counters. These could cause remote node cache line 804 * bouncing and will have to be only done when 459 * bouncing and will have to be only done when necessary. 805 * 460 * 806 * The function returns the number of global c 461 * The function returns the number of global counters updated. 807 */ 462 */ 808 static int refresh_cpu_vm_stats(bool do_pagese 463 static int refresh_cpu_vm_stats(bool do_pagesets) 809 { 464 { 810 struct pglist_data *pgdat; << 811 struct zone *zone; 465 struct zone *zone; 812 int i; 466 int i; 813 int global_zone_diff[NR_VM_ZONE_STAT_I !! 467 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; 814 int global_node_diff[NR_VM_NODE_STAT_I << 815 int changes = 0; 468 int changes = 0; 816 469 817 for_each_populated_zone(zone) { 470 for_each_populated_zone(zone) { 818 struct per_cpu_zonestat __perc !! 471 struct per_cpu_pageset __percpu *p = zone->pageset; 819 struct per_cpu_pages __percpu << 820 472 821 for (i = 0; i < NR_VM_ZONE_STA 473 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { 822 int v; 474 int v; 823 475 824 v = this_cpu_xchg(pzst !! 
476 v = this_cpu_xchg(p->vm_stat_diff[i], 0); 825 if (v) { 477 if (v) { 826 478 827 atomic_long_ad 479 atomic_long_add(v, &zone->vm_stat[i]); 828 global_zone_di !! 480 global_diff[i] += v; 829 #ifdef CONFIG_NUMA 481 #ifdef CONFIG_NUMA 830 /* 3 seconds i 482 /* 3 seconds idle till flush */ 831 __this_cpu_wri !! 483 __this_cpu_write(p->expire, 3); 832 #endif 484 #endif 833 } 485 } 834 } 486 } 835 !! 487 #ifdef CONFIG_NUMA 836 if (do_pagesets) { 488 if (do_pagesets) { 837 cond_resched(); 489 cond_resched(); 838 << 839 changes += decay_pcp_h << 840 #ifdef CONFIG_NUMA << 841 /* 490 /* 842 * Deal with draining 491 * Deal with draining the remote pageset of this 843 * processor 492 * processor 844 * 493 * 845 * Check if there are 494 * Check if there are pages remaining in this pageset 846 * if not then there i 495 * if not then there is nothing to expire. 847 */ 496 */ 848 if (!__this_cpu_read(p !! 497 if (!__this_cpu_read(p->expire) || 849 !__this_cpu_rea !! 498 !__this_cpu_read(p->pcp.count)) 850 continue; 499 continue; 851 500 852 /* 501 /* 853 * We never drain zone 502 * We never drain zones local to this processor. 854 */ 503 */ 855 if (zone_to_nid(zone) 504 if (zone_to_nid(zone) == numa_node_id()) { 856 __this_cpu_wri !! 505 __this_cpu_write(p->expire, 0); 857 continue; 506 continue; 858 } 507 } 859 508 860 if (__this_cpu_dec_ret !! 509 if (__this_cpu_dec_return(p->expire)) 861 changes++; << 862 continue; 510 continue; 863 } << 864 511 865 if (__this_cpu_read(pc !! 512 if (__this_cpu_read(p->pcp.count)) { 866 drain_zone_pag !! 513 drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); 867 changes++; 514 changes++; 868 } 515 } 869 #endif << 870 } << 871 } << 872 << 873 for_each_online_pgdat(pgdat) { << 874 struct per_cpu_nodestat __perc << 875 << 876 for (i = 0; i < NR_VM_NODE_STA << 877 int v; << 878 << 879 v = this_cpu_xchg(p->v << 880 if (v) { << 881 atomic_long_ad << 882 global_node_di << 883 } << 884 } 516 } >> 517 #endif 885 } 518 } 886 !! 519 changes += fold_diff(global_diff); 887 changes += fold_diff(global_zone_diff, << 888 return changes; 520 return changes; 889 } 521 } 890 522 891 /* 523 /* 892 * Fold the data for an offline cpu into the g 524 * Fold the data for an offline cpu into the global array. 893 * There cannot be any access by the offline c 525 * There cannot be any access by the offline cpu and therefore 894 * synchronization is simplified. 526 * synchronization is simplified. 895 */ 527 */ 896 void cpu_vm_stats_fold(int cpu) 528 void cpu_vm_stats_fold(int cpu) 897 { 529 { 898 struct pglist_data *pgdat; << 899 struct zone *zone; 530 struct zone *zone; 900 int i; 531 int i; 901 int global_zone_diff[NR_VM_ZONE_STAT_I !! 532 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; 902 int global_node_diff[NR_VM_NODE_STAT_I << 903 533 904 for_each_populated_zone(zone) { 534 for_each_populated_zone(zone) { 905 struct per_cpu_zonestat *pzsta !! 535 struct per_cpu_pageset *p; 906 536 907 pzstats = per_cpu_ptr(zone->pe !! 537 p = per_cpu_ptr(zone->pageset, cpu); 908 538 909 for (i = 0; i < NR_VM_ZONE_STA !! 539 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 910 if (pzstats->vm_stat_d !! 540 if (p->vm_stat_diff[i]) { 911 int v; 541 int v; 912 542 913 v = pzstats->v !! 543 v = p->vm_stat_diff[i]; 914 pzstats->vm_st !! 544 p->vm_stat_diff[i] = 0; 915 atomic_long_ad 545 atomic_long_add(v, &zone->vm_stat[i]); 916 global_zone_di !! 
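/*
 * Editor's sketch: fold_diff() above simply adds every accumulated per-CPU
 * delta into the corresponding global counter and reports how many counters
 * actually changed, which the refresh path uses to decide whether another
 * deferred update pass is worthwhile.  NITEMS and the arrays here are
 * illustrative stand-ins for the real vmstat arrays.
 */
#define NITEMS 16

static long vm_stat_sketch[NITEMS];              /* global counters */

static int fold_diff_sketch(const int *diff)     /* per-CPU deltas gathered earlier */
{
	int i, changes = 0;

	for (i = 0; i < NITEMS; i++) {
		if (diff[i]) {
			vm_stat_sketch[i] += diff[i];   /* atomic_long_add() in the kernel */
			changes++;
		}
	}
	return changes;
}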
546 global_diff[i] += v; 917 } << 918 } << 919 #ifdef CONFIG_NUMA << 920 for (i = 0; i < NR_VM_NUMA_EVE << 921 if (pzstats->vm_numa_e << 922 unsigned long << 923 << 924 v = pzstats->v << 925 pzstats->vm_nu << 926 zone_numa_even << 927 } 547 } 928 } << 929 #endif << 930 } 548 } 931 549 932 for_each_online_pgdat(pgdat) { !! 550 fold_diff(global_diff); 933 struct per_cpu_nodestat *p; << 934 << 935 p = per_cpu_ptr(pgdat->per_cpu << 936 << 937 for (i = 0; i < NR_VM_NODE_STA << 938 if (p->vm_node_stat_di << 939 int v; << 940 << 941 v = p->vm_node << 942 p->vm_node_sta << 943 atomic_long_ad << 944 global_node_di << 945 } << 946 } << 947 << 948 fold_diff(global_zone_diff, global_nod << 949 } 551 } 950 552 951 /* 553 /* 952 * this is only called if !populated_zone(zone 554 * this is only called if !populated_zone(zone), which implies no other users of 953 * pset->vm_stat_diff[] exist. !! 555 * pset->vm_stat_diff[] exsist. 954 */ 556 */ 955 void drain_zonestat(struct zone *zone, struct !! 557 void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) 956 { 558 { 957 unsigned long v; << 958 int i; 559 int i; 959 560 960 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; !! 561 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 961 if (pzstats->vm_stat_diff[i]) !! 562 if (pset->vm_stat_diff[i]) { 962 v = pzstats->vm_stat_d !! 563 int v = pset->vm_stat_diff[i]; 963 pzstats->vm_stat_diff[ !! 564 pset->vm_stat_diff[i] = 0; 964 zone_page_state_add(v, !! 565 atomic_long_add(v, &zone->vm_stat[i]); 965 } !! 566 atomic_long_add(v, &vm_stat[i]); 966 } << 967 << 968 #ifdef CONFIG_NUMA << 969 for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS << 970 if (pzstats->vm_numa_event[i]) << 971 v = pzstats->vm_numa_e << 972 pzstats->vm_numa_event << 973 zone_numa_event_add(v, << 974 } 567 } 975 } << 976 #endif << 977 } 568 } 978 #endif 569 #endif 979 570 980 #ifdef CONFIG_NUMA 571 #ifdef CONFIG_NUMA 981 /* 572 /* 982 * Determine the per node value of a stat item !! 573 * zonelist = the list of zones passed to the allocator 983 * is called frequently in a NUMA machine, so !! 574 * z = the zone from which the allocation occurred. 984 * frugal as possible. !! 575 * >> 576 * Must be called with interrupts disabled. >> 577 * >> 578 * When __GFP_OTHER_NODE is set assume the node of the preferred >> 579 * zone is the local node. This is useful for daemons who allocate >> 580 * memory on behalf of other processes. 985 */ 581 */ 986 unsigned long sum_zone_node_page_state(int nod !! 582 void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) 987 enum zone_sta << 988 { 583 { 989 struct zone *zones = NODE_DATA(node)-> !! 584 if (z->zone_pgdat == preferred_zone->zone_pgdat) { 990 int i; !! 585 __inc_zone_state(z, NUMA_HIT); 991 unsigned long count = 0; !! 586 } else { 992 !! 587 __inc_zone_state(z, NUMA_MISS); 993 for (i = 0; i < MAX_NR_ZONES; i++) !! 588 __inc_zone_state(preferred_zone, NUMA_FOREIGN); 994 count += zone_page_state(zones !! 589 } 995 !! 590 if (z->node == ((flags & __GFP_OTHER_NODE) ? 996 return count; !! 591 preferred_zone->node : numa_node_id())) 997 } !! 592 __inc_zone_state(z, NUMA_LOCAL); 998 !! 593 else 999 /* Determine the per node value of a numa stat !! 
594 __inc_zone_state(z, NUMA_OTHER); 1000 unsigned long sum_zone_numa_event_state(int n << 1001 enum numa_st << 1002 { << 1003 struct zone *zones = NODE_DATA(node)- << 1004 unsigned long count = 0; << 1005 int i; << 1006 << 1007 for (i = 0; i < MAX_NR_ZONES; i++) << 1008 count += zone_numa_event_stat << 1009 << 1010 return count; << 1011 } 595 } 1012 596 1013 /* 597 /* 1014 * Determine the per node value of a stat ite 598 * Determine the per node value of a stat item. 1015 */ 599 */ 1016 unsigned long node_page_state_pages(struct pg !! 600 unsigned long node_page_state(int node, enum zone_stat_item item) 1017 enum node << 1018 { 601 { 1019 long x = atomic_long_read(&pgdat->vm_ !! 602 struct zone *zones = NODE_DATA(node)->node_zones; 1020 #ifdef CONFIG_SMP << 1021 if (x < 0) << 1022 x = 0; << 1023 #endif << 1024 return x; << 1025 } << 1026 << 1027 unsigned long node_page_state(struct pglist_d << 1028 enum node_stat_ << 1029 { << 1030 VM_WARN_ON_ONCE(vmstat_item_in_bytes( << 1031 603 1032 return node_page_state_pages(pgdat, i !! 604 return 1033 } !! 605 #ifdef CONFIG_ZONE_DMA >> 606 zone_page_state(&zones[ZONE_DMA], item) + 1034 #endif 607 #endif 1035 !! 608 #ifdef CONFIG_ZONE_DMA32 1036 /* !! 609 zone_page_state(&zones[ZONE_DMA32], item) + 1037 * Count number of pages "struct page" and "s !! 610 #endif 1038 * nr_memmap_boot_pages: # of pages allocated !! 611 #ifdef CONFIG_HIGHMEM 1039 * nr_memmap_pages: # of pages that were allo !! 612 zone_page_state(&zones[ZONE_HIGHMEM], item) + 1040 */ !! 613 #endif 1041 static atomic_long_t nr_memmap_boot_pages = A !! 614 zone_page_state(&zones[ZONE_NORMAL], item) + 1042 static atomic_long_t nr_memmap_pages = ATOMIC !! 615 zone_page_state(&zones[ZONE_MOVABLE], item); 1043 << 1044 void memmap_boot_pages_add(long delta) << 1045 { << 1046 atomic_long_add(delta, &nr_memmap_boo << 1047 } 616 } 1048 617 1049 void memmap_pages_add(long delta) !! 618 #endif 1050 { << 1051 atomic_long_add(delta, &nr_memmap_pag << 1052 } << 1053 619 1054 #ifdef CONFIG_COMPACTION 620 #ifdef CONFIG_COMPACTION 1055 621 1056 struct contig_page_info { 622 struct contig_page_info { 1057 unsigned long free_pages; 623 unsigned long free_pages; 1058 unsigned long free_blocks_total; 624 unsigned long free_blocks_total; 1059 unsigned long free_blocks_suitable; 625 unsigned long free_blocks_suitable; 1060 }; 626 }; 1061 627 1062 /* 628 /* 1063 * Calculate the number of free pages in a zo 629 * Calculate the number of free pages in a zone, how many contiguous 1064 * pages are free and how many are large enou 630 * pages are free and how many are large enough to satisfy an allocation of 1065 * the target size. Note that this function m 631 * the target size. Note that this function makes no attempt to estimate 1066 * how many suitable free blocks there *might 632 * how many suitable free blocks there *might* be if MOVABLE pages were 1067 * migrated. Calculating that is possible, bu 633 * migrated. 
Calculating that is possible, but expensive and can be 1068 * figured out from userspace 634 * figured out from userspace 1069 */ 635 */ 1070 static void fill_contig_page_info(struct zone 636 static void fill_contig_page_info(struct zone *zone, 1071 unsigned int 637 unsigned int suitable_order, 1072 struct contig 638 struct contig_page_info *info) 1073 { 639 { 1074 unsigned int order; 640 unsigned int order; 1075 641 1076 info->free_pages = 0; 642 info->free_pages = 0; 1077 info->free_blocks_total = 0; 643 info->free_blocks_total = 0; 1078 info->free_blocks_suitable = 0; 644 info->free_blocks_suitable = 0; 1079 645 1080 for (order = 0; order < NR_PAGE_ORDER !! 646 for (order = 0; order < MAX_ORDER; order++) { 1081 unsigned long blocks; 647 unsigned long blocks; 1082 648 1083 /* !! 649 /* Count number of free blocks */ 1084 * Count number of free block !! 650 blocks = zone->free_area[order].nr_free; 1085 * << 1086 * Access to nr_free is lockl << 1087 * diagnostic purposes. Use d << 1088 */ << 1089 blocks = data_race(zone->free << 1090 info->free_blocks_total += bl 651 info->free_blocks_total += blocks; 1091 652 1092 /* Count free base pages */ 653 /* Count free base pages */ 1093 info->free_pages += blocks << 654 info->free_pages += blocks << order; 1094 655 1095 /* Count the suitable free bl 656 /* Count the suitable free blocks */ 1096 if (order >= suitable_order) 657 if (order >= suitable_order) 1097 info->free_blocks_sui 658 info->free_blocks_suitable += blocks << 1098 659 (order - suitable_order); 1099 } 660 } 1100 } 661 } 1101 662 1102 /* 663 /* 1103 * A fragmentation index only makes sense if 664 * A fragmentation index only makes sense if an allocation of a requested 1104 * size would fail. If that is true, the frag 665 * size would fail. If that is true, the fragmentation index indicates 1105 * whether external fragmentation or a lack o 666 * whether external fragmentation or a lack of memory was the problem. 1106 * The value can be used to determine if page 667 * The value can be used to determine if page reclaim or compaction 1107 * should be used 668 * should be used 1108 */ 669 */ 1109 static int __fragmentation_index(unsigned int 670 static int __fragmentation_index(unsigned int order, struct contig_page_info *info) 1110 { 671 { 1111 unsigned long requested = 1UL << orde 672 unsigned long requested = 1UL << order; 1112 673 1113 if (WARN_ON_ONCE(order > MAX_PAGE_ORD << 1114 return 0; << 1115 << 1116 if (!info->free_blocks_total) 674 if (!info->free_blocks_total) 1117 return 0; 675 return 0; 1118 676 1119 /* Fragmentation index only makes sen 677 /* Fragmentation index only makes sense when a request would fail */ 1120 if (info->free_blocks_suitable) 678 if (info->free_blocks_suitable) 1121 return -1000; 679 return -1000; 1122 680 1123 /* 681 /* 1124 * Index is between 0 and 1 so return 682 * Index is between 0 and 1 so return within 3 decimal places 1125 * 683 * 1126 * 0 => allocation would fail due to 684 * 0 => allocation would fail due to lack of memory 1127 * 1 => allocation would fail due to 685 * 1 => allocation would fail due to fragmentation 1128 */ 686 */ 1129 return 1000 - div_u64( (1000+(div_u64 687 return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); 1130 } 688 } 1131 689 1132 /* << 1133 * Calculates external fragmentation within a << 1134 * It is defined as the percentage of pages f << 1135 * less than 1 << order. 
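/*
 * Editor's sketch of the fragmentation index computed above: the value only
 * matters when an allocation of "order" would fail, and then distinguishes
 * "no memory" (near 0) from "memory present but too fragmented" (near 1000,
 * i.e. 1.000 expressed in three decimal places).  struct contig_info and
 * frag_index_sketch() are illustrative names for the structures above.
 */
struct contig_info {
	unsigned long free_pages;
	unsigned long free_blocks_total;
	unsigned long free_blocks_suitable;
};

static int frag_index_sketch(unsigned int order, const struct contig_info *info)
{
	unsigned long requested = 1UL << order;

	if (!info->free_blocks_total)
		return 0;                /* nothing free at all: lack of memory */
	if (info->free_blocks_suitable)
		return -1000;            /* the request would not fail */

	/* 1000 - (1000 + 1000 * free_pages / requested) / free_blocks_total */
	return 1000 - (int)((1000 + (info->free_pages * 1000UL) / requested) /
			    info->free_blocks_total);
}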
It returns values in << 1136 */ << 1137 unsigned int extfrag_for_order(struct zone *z << 1138 { << 1139 struct contig_page_info info; << 1140 << 1141 fill_contig_page_info(zone, order, &i << 1142 if (info.free_pages == 0) << 1143 return 0; << 1144 << 1145 return div_u64((info.free_pages - << 1146 (info.free_blocks_sui << 1147 info.free_pages); << 1148 } << 1149 << 1150 /* Same as __fragmentation index but allocs c 690 /* Same as __fragmentation index but allocs contig_page_info on stack */ 1151 int fragmentation_index(struct zone *zone, un 691 int fragmentation_index(struct zone *zone, unsigned int order) 1152 { 692 { 1153 struct contig_page_info info; 693 struct contig_page_info info; 1154 694 1155 fill_contig_page_info(zone, order, &i 695 fill_contig_page_info(zone, order, &info); 1156 return __fragmentation_index(order, & 696 return __fragmentation_index(order, &info); 1157 } 697 } 1158 #endif 698 #endif 1159 699 1160 #if defined(CONFIG_PROC_FS) || defined(CONFIG !! 700 #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) 1161 defined(CONFIG_NUMA) || defined(CONFIG_ME << 1162 #ifdef CONFIG_ZONE_DMA 701 #ifdef CONFIG_ZONE_DMA 1163 #define TEXT_FOR_DMA(xx) xx "_dma", 702 #define TEXT_FOR_DMA(xx) xx "_dma", 1164 #else 703 #else 1165 #define TEXT_FOR_DMA(xx) 704 #define TEXT_FOR_DMA(xx) 1166 #endif 705 #endif 1167 706 1168 #ifdef CONFIG_ZONE_DMA32 707 #ifdef CONFIG_ZONE_DMA32 1169 #define TEXT_FOR_DMA32(xx) xx "_dma32", 708 #define TEXT_FOR_DMA32(xx) xx "_dma32", 1170 #else 709 #else 1171 #define TEXT_FOR_DMA32(xx) 710 #define TEXT_FOR_DMA32(xx) 1172 #endif 711 #endif 1173 712 1174 #ifdef CONFIG_HIGHMEM 713 #ifdef CONFIG_HIGHMEM 1175 #define TEXT_FOR_HIGHMEM(xx) xx "_high", 714 #define TEXT_FOR_HIGHMEM(xx) xx "_high", 1176 #else 715 #else 1177 #define TEXT_FOR_HIGHMEM(xx) 716 #define TEXT_FOR_HIGHMEM(xx) 1178 #endif 717 #endif 1179 718 1180 #ifdef CONFIG_ZONE_DEVICE << 1181 #define TEXT_FOR_DEVICE(xx) xx "_device", << 1182 #else << 1183 #define TEXT_FOR_DEVICE(xx) << 1184 #endif << 1185 << 1186 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) 719 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ 1187 TEXT_ !! 720 TEXT_FOR_HIGHMEM(xx) xx "_movable", 1188 TEXT_ << 1189 721 1190 const char * const vmstat_text[] = { 722 const char * const vmstat_text[] = { 1191 /* enum zone_stat_item counters */ !! 723 /* enum zone_stat_item countes */ 1192 "nr_free_pages", 724 "nr_free_pages", 1193 "nr_zone_inactive_anon", !! 725 "nr_alloc_batch", 1194 "nr_zone_active_anon", << 1195 "nr_zone_inactive_file", << 1196 "nr_zone_active_file", << 1197 "nr_zone_unevictable", << 1198 "nr_zone_write_pending", << 1199 "nr_mlock", << 1200 "nr_bounce", << 1201 #if IS_ENABLED(CONFIG_ZSMALLOC) << 1202 "nr_zspages", << 1203 #endif << 1204 "nr_free_cma", << 1205 #ifdef CONFIG_UNACCEPTED_MEMORY << 1206 "nr_unaccepted", << 1207 #endif << 1208 << 1209 /* enum numa_stat_item counters */ << 1210 #ifdef CONFIG_NUMA << 1211 "numa_hit", << 1212 "numa_miss", << 1213 "numa_foreign", << 1214 "numa_interleave", << 1215 "numa_local", << 1216 "numa_other", << 1217 #endif << 1218 << 1219 /* enum node_stat_item counters */ << 1220 "nr_inactive_anon", 726 "nr_inactive_anon", 1221 "nr_active_anon", 727 "nr_active_anon", 1222 "nr_inactive_file", 728 "nr_inactive_file", 1223 "nr_active_file", 729 "nr_active_file", 1224 "nr_unevictable", 730 "nr_unevictable", 1225 "nr_slab_reclaimable", !! 
731 "nr_mlock", 1226 "nr_slab_unreclaimable", << 1227 "nr_isolated_anon", << 1228 "nr_isolated_file", << 1229 "workingset_nodes", << 1230 "workingset_refault_anon", << 1231 "workingset_refault_file", << 1232 "workingset_activate_anon", << 1233 "workingset_activate_file", << 1234 "workingset_restore_anon", << 1235 "workingset_restore_file", << 1236 "workingset_nodereclaim", << 1237 "nr_anon_pages", 732 "nr_anon_pages", 1238 "nr_mapped", 733 "nr_mapped", 1239 "nr_file_pages", 734 "nr_file_pages", 1240 "nr_dirty", 735 "nr_dirty", 1241 "nr_writeback", 736 "nr_writeback", 1242 "nr_writeback_temp", !! 737 "nr_slab_reclaimable", 1243 "nr_shmem", !! 738 "nr_slab_unreclaimable", 1244 "nr_shmem_hugepages", !! 739 "nr_page_table_pages", 1245 "nr_shmem_pmdmapped", !! 740 "nr_kernel_stack", 1246 "nr_file_hugepages", !! 741 "nr_overhead", 1247 "nr_file_pmdmapped", !! 742 "nr_unstable", 1248 "nr_anon_transparent_hugepages", !! 743 "nr_bounce", 1249 "nr_vmscan_write", 744 "nr_vmscan_write", 1250 "nr_vmscan_immediate_reclaim", 745 "nr_vmscan_immediate_reclaim", >> 746 "nr_writeback_temp", >> 747 "nr_isolated_anon", >> 748 "nr_isolated_file", >> 749 "nr_shmem", 1251 "nr_dirtied", 750 "nr_dirtied", 1252 "nr_written", 751 "nr_written", 1253 "nr_throttled_written", !! 752 "nr_pages_scanned", 1254 "nr_kernel_misc_reclaimable", !! 753 1255 "nr_foll_pin_acquired", !! 754 #ifdef CONFIG_NUMA 1256 "nr_foll_pin_released", !! 755 "numa_hit", 1257 "nr_kernel_stack", !! 756 "numa_miss", 1258 #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) !! 757 "numa_foreign", 1259 "nr_shadow_call_stack", !! 758 "numa_interleave", 1260 #endif !! 759 "numa_local", 1261 "nr_page_table_pages", !! 760 "numa_other", 1262 "nr_sec_page_table_pages", << 1263 #ifdef CONFIG_IOMMU_SUPPORT << 1264 "nr_iommu_pages", << 1265 #endif << 1266 #ifdef CONFIG_SWAP << 1267 "nr_swapcached", << 1268 #endif << 1269 #ifdef CONFIG_NUMA_BALANCING << 1270 "pgpromote_success", << 1271 "pgpromote_candidate", << 1272 #endif 761 #endif 1273 "pgdemote_kswapd", !! 762 "workingset_refault", 1274 "pgdemote_direct", !! 763 "workingset_activate", 1275 "pgdemote_khugepaged", !! 764 "workingset_nodereclaim", 1276 /* system-wide enum vm_stat_item coun !! 765 "nr_anon_transparent_hugepages", >> 766 "nr_free_cma", >> 767 >> 768 /* enum writeback_stat_item counters */ 1277 "nr_dirty_threshold", 769 "nr_dirty_threshold", 1278 "nr_dirty_background_threshold", 770 "nr_dirty_background_threshold", 1279 "nr_memmap_pages", << 1280 "nr_memmap_boot_pages", << 1281 771 1282 #if defined(CONFIG_VM_EVENT_COUNTERS) || defi !! 772 #ifdef CONFIG_VM_EVENT_COUNTERS 1283 /* enum vm_event_item counters */ 773 /* enum vm_event_item counters */ 1284 "pgpgin", 774 "pgpgin", 1285 "pgpgout", 775 "pgpgout", 1286 "pswpin", 776 "pswpin", 1287 "pswpout", 777 "pswpout", 1288 778 1289 TEXTS_FOR_ZONES("pgalloc") 779 TEXTS_FOR_ZONES("pgalloc") 1290 TEXTS_FOR_ZONES("allocstall") << 1291 TEXTS_FOR_ZONES("pgskip") << 1292 780 1293 "pgfree", 781 "pgfree", 1294 "pgactivate", 782 "pgactivate", 1295 "pgdeactivate", 783 "pgdeactivate", 1296 "pglazyfree", << 1297 784 1298 "pgfault", 785 "pgfault", 1299 "pgmajfault", 786 "pgmajfault", 1300 "pglazyfreed", << 1301 787 1302 "pgrefill", !! 788 TEXTS_FOR_ZONES("pgrefill") 1303 "pgreuse", !! 789 TEXTS_FOR_ZONES("pgsteal_kswapd") 1304 "pgsteal_kswapd", !! 790 TEXTS_FOR_ZONES("pgsteal_direct") 1305 "pgsteal_direct", !! 791 TEXTS_FOR_ZONES("pgscan_kswapd") 1306 "pgsteal_khugepaged", !! 
792 TEXTS_FOR_ZONES("pgscan_direct") 1307 "pgscan_kswapd", << 1308 "pgscan_direct", << 1309 "pgscan_khugepaged", << 1310 "pgscan_direct_throttle", 793 "pgscan_direct_throttle", 1311 "pgscan_anon", << 1312 "pgscan_file", << 1313 "pgsteal_anon", << 1314 "pgsteal_file", << 1315 794 1316 #ifdef CONFIG_NUMA 795 #ifdef CONFIG_NUMA 1317 "zone_reclaim_success", << 1318 "zone_reclaim_failed", 796 "zone_reclaim_failed", 1319 #endif 797 #endif 1320 "pginodesteal", 798 "pginodesteal", 1321 "slabs_scanned", 799 "slabs_scanned", 1322 "kswapd_inodesteal", 800 "kswapd_inodesteal", 1323 "kswapd_low_wmark_hit_quickly", 801 "kswapd_low_wmark_hit_quickly", 1324 "kswapd_high_wmark_hit_quickly", 802 "kswapd_high_wmark_hit_quickly", 1325 "pageoutrun", 803 "pageoutrun", >> 804 "allocstall", 1326 805 1327 "pgrotated", 806 "pgrotated", 1328 807 1329 "drop_pagecache", 808 "drop_pagecache", 1330 "drop_slab", 809 "drop_slab", 1331 "oom_kill", << 1332 810 1333 #ifdef CONFIG_NUMA_BALANCING 811 #ifdef CONFIG_NUMA_BALANCING 1334 "numa_pte_updates", 812 "numa_pte_updates", 1335 "numa_huge_pte_updates", 813 "numa_huge_pte_updates", 1336 "numa_hint_faults", 814 "numa_hint_faults", 1337 "numa_hint_faults_local", 815 "numa_hint_faults_local", 1338 "numa_pages_migrated", 816 "numa_pages_migrated", 1339 #endif 817 #endif 1340 #ifdef CONFIG_MIGRATION 818 #ifdef CONFIG_MIGRATION 1341 "pgmigrate_success", 819 "pgmigrate_success", 1342 "pgmigrate_fail", 820 "pgmigrate_fail", 1343 "thp_migration_success", << 1344 "thp_migration_fail", << 1345 "thp_migration_split", << 1346 #endif 821 #endif 1347 #ifdef CONFIG_COMPACTION 822 #ifdef CONFIG_COMPACTION 1348 "compact_migrate_scanned", 823 "compact_migrate_scanned", 1349 "compact_free_scanned", 824 "compact_free_scanned", 1350 "compact_isolated", 825 "compact_isolated", 1351 "compact_stall", 826 "compact_stall", 1352 "compact_fail", 827 "compact_fail", 1353 "compact_success", 828 "compact_success", 1354 "compact_daemon_wake", << 1355 "compact_daemon_migrate_scanned", << 1356 "compact_daemon_free_scanned", << 1357 #endif 829 #endif 1358 830 1359 #ifdef CONFIG_HUGETLB_PAGE 831 #ifdef CONFIG_HUGETLB_PAGE 1360 "htlb_buddy_alloc_success", 832 "htlb_buddy_alloc_success", 1361 "htlb_buddy_alloc_fail", 833 "htlb_buddy_alloc_fail", 1362 #endif 834 #endif 1363 #ifdef CONFIG_CMA << 1364 "cma_alloc_success", << 1365 "cma_alloc_fail", << 1366 #endif << 1367 "unevictable_pgs_culled", 835 "unevictable_pgs_culled", 1368 "unevictable_pgs_scanned", 836 "unevictable_pgs_scanned", 1369 "unevictable_pgs_rescued", 837 "unevictable_pgs_rescued", 1370 "unevictable_pgs_mlocked", 838 "unevictable_pgs_mlocked", 1371 "unevictable_pgs_munlocked", 839 "unevictable_pgs_munlocked", 1372 "unevictable_pgs_cleared", 840 "unevictable_pgs_cleared", 1373 "unevictable_pgs_stranded", 841 "unevictable_pgs_stranded", 1374 842 1375 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 843 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 1376 "thp_fault_alloc", 844 "thp_fault_alloc", 1377 "thp_fault_fallback", 845 "thp_fault_fallback", 1378 "thp_fault_fallback_charge", << 1379 "thp_collapse_alloc", 846 "thp_collapse_alloc", 1380 "thp_collapse_alloc_failed", 847 "thp_collapse_alloc_failed", 1381 "thp_file_alloc", !! 
848 "thp_split", 1382 "thp_file_fallback", << 1383 "thp_file_fallback_charge", << 1384 "thp_file_mapped", << 1385 "thp_split_page", << 1386 "thp_split_page_failed", << 1387 "thp_deferred_split_page", << 1388 "thp_underused_split_page", << 1389 "thp_split_pmd", << 1390 "thp_scan_exceed_none_pte", << 1391 "thp_scan_exceed_swap_pte", << 1392 "thp_scan_exceed_share_pte", << 1393 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_ << 1394 "thp_split_pud", << 1395 #endif << 1396 "thp_zero_page_alloc", 849 "thp_zero_page_alloc", 1397 "thp_zero_page_alloc_failed", 850 "thp_zero_page_alloc_failed", 1398 "thp_swpout", << 1399 "thp_swpout_fallback", << 1400 #endif 851 #endif 1401 #ifdef CONFIG_MEMORY_BALLOON 852 #ifdef CONFIG_MEMORY_BALLOON 1402 "balloon_inflate", 853 "balloon_inflate", 1403 "balloon_deflate", 854 "balloon_deflate", 1404 #ifdef CONFIG_BALLOON_COMPACTION 855 #ifdef CONFIG_BALLOON_COMPACTION 1405 "balloon_migrate", 856 "balloon_migrate", 1406 #endif 857 #endif 1407 #endif /* CONFIG_MEMORY_BALLOON */ 858 #endif /* CONFIG_MEMORY_BALLOON */ 1408 #ifdef CONFIG_DEBUG_TLBFLUSH 859 #ifdef CONFIG_DEBUG_TLBFLUSH 1409 "nr_tlb_remote_flush", 860 "nr_tlb_remote_flush", 1410 "nr_tlb_remote_flush_received", 861 "nr_tlb_remote_flush_received", 1411 "nr_tlb_local_flush_all", 862 "nr_tlb_local_flush_all", 1412 "nr_tlb_local_flush_one", 863 "nr_tlb_local_flush_one", 1413 #endif /* CONFIG_DEBUG_TLBFLUSH */ 864 #endif /* CONFIG_DEBUG_TLBFLUSH */ 1414 865 1415 #ifdef CONFIG_SWAP !! 866 #ifdef CONFIG_DEBUG_VM_VMACACHE 1416 "swap_ra", !! 867 "vmacache_find_calls", 1417 "swap_ra_hit", !! 868 "vmacache_find_hits", 1418 #ifdef CONFIG_KSM << 1419 "ksm_swpin_copy", << 1420 #endif << 1421 #endif << 1422 #ifdef CONFIG_KSM << 1423 "cow_ksm", << 1424 #endif 869 #endif 1425 #ifdef CONFIG_ZSWAP !! 870 #endif /* CONFIG_VM_EVENTS_COUNTERS */ 1426 "zswpin", << 1427 "zswpout", << 1428 "zswpwb", << 1429 #endif << 1430 #ifdef CONFIG_X86 << 1431 "direct_map_level2_splits", << 1432 "direct_map_level3_splits", << 1433 #endif << 1434 #ifdef CONFIG_PER_VMA_LOCK_STATS << 1435 "vma_lock_success", << 1436 "vma_lock_abort", << 1437 "vma_lock_retry", << 1438 "vma_lock_miss", << 1439 #endif << 1440 #ifdef CONFIG_DEBUG_STACK_USAGE << 1441 "kstack_1k", << 1442 #if THREAD_SIZE > 1024 << 1443 "kstack_2k", << 1444 #endif << 1445 #if THREAD_SIZE > 2048 << 1446 "kstack_4k", << 1447 #endif << 1448 #if THREAD_SIZE > 4096 << 1449 "kstack_8k", << 1450 #endif << 1451 #if THREAD_SIZE > 8192 << 1452 "kstack_16k", << 1453 #endif << 1454 #if THREAD_SIZE > 16384 << 1455 "kstack_32k", << 1456 #endif << 1457 #if THREAD_SIZE > 32768 << 1458 "kstack_64k", << 1459 #endif << 1460 #if THREAD_SIZE > 65536 << 1461 "kstack_rest", << 1462 #endif << 1463 #endif << 1464 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_ << 1465 }; 871 }; 1466 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || C !! 
872 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ >> 873 1467 874 1468 #if (defined(CONFIG_DEBUG_FS) && defined(CONF 875 #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ 1469 defined(CONFIG_PROC_FS) 876 defined(CONFIG_PROC_FS) 1470 static void *frag_start(struct seq_file *m, l 877 static void *frag_start(struct seq_file *m, loff_t *pos) 1471 { 878 { 1472 pg_data_t *pgdat; 879 pg_data_t *pgdat; 1473 loff_t node = *pos; 880 loff_t node = *pos; 1474 881 1475 for (pgdat = first_online_pgdat(); 882 for (pgdat = first_online_pgdat(); 1476 pgdat && node; 883 pgdat && node; 1477 pgdat = next_online_pgdat(pgdat) 884 pgdat = next_online_pgdat(pgdat)) 1478 --node; 885 --node; 1479 886 1480 return pgdat; 887 return pgdat; 1481 } 888 } 1482 889 1483 static void *frag_next(struct seq_file *m, vo 890 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) 1484 { 891 { 1485 pg_data_t *pgdat = (pg_data_t *)arg; 892 pg_data_t *pgdat = (pg_data_t *)arg; 1486 893 1487 (*pos)++; 894 (*pos)++; 1488 return next_online_pgdat(pgdat); 895 return next_online_pgdat(pgdat); 1489 } 896 } 1490 897 1491 static void frag_stop(struct seq_file *m, voi 898 static void frag_stop(struct seq_file *m, void *arg) 1492 { 899 { 1493 } 900 } 1494 901 1495 /* !! 902 /* Walk all the zones in a node and print using a callback */ 1496 * Walk zones in a node and print using a cal << 1497 * If @assert_populated is true, only use cal << 1498 */ << 1499 static void walk_zones_in_node(struct seq_fil 903 static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, 1500 bool assert_populated, bool n << 1501 void (*print)(struct seq_file 904 void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) 1502 { 905 { 1503 struct zone *zone; 906 struct zone *zone; 1504 struct zone *node_zones = pgdat->node 907 struct zone *node_zones = pgdat->node_zones; 1505 unsigned long flags; 908 unsigned long flags; 1506 909 1507 for (zone = node_zones; zone - node_z 910 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 1508 if (assert_populated && !popu !! 911 if (!populated_zone(zone)) 1509 continue; 912 continue; 1510 913 1511 if (!nolock) !! 914 spin_lock_irqsave(&zone->lock, flags); 1512 spin_lock_irqsave(&zo << 1513 print(m, pgdat, zone); 915 print(m, pgdat, zone); 1514 if (!nolock) !! 916 spin_unlock_irqrestore(&zone->lock, flags); 1515 spin_unlock_irqrestor << 1516 } 917 } 1517 } 918 } 1518 #endif 919 #endif 1519 920 1520 #ifdef CONFIG_PROC_FS 921 #ifdef CONFIG_PROC_FS >> 922 static char * const migratetype_names[MIGRATE_TYPES] = { >> 923 "Unmovable", >> 924 "Movable", >> 925 "Reclaimable", >> 926 "HighAtomic", >> 927 #ifdef CONFIG_CMA >> 928 "CMA", >> 929 #endif >> 930 #ifdef CONFIG_MEMORY_ISOLATION >> 931 "Isolate", >> 932 #endif >> 933 }; >> 934 1521 static void frag_show_print(struct seq_file * 935 static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, 1522 936 struct zone *zone) 1523 { 937 { 1524 int order; 938 int order; 1525 939 1526 seq_printf(m, "Node %d, zone %8s ", p 940 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); 1527 for (order = 0; order < NR_PAGE_ORDER !! 941 for (order = 0; order < MAX_ORDER; ++order) 1528 /* !! 942 seq_printf(m, "%6lu ", zone->free_area[order].nr_free); 1529 * Access to nr_free is lockl << 1530 * printing purposes. Use dat << 1531 */ << 1532 seq_printf(m, "%6lu ", data_r << 1533 seq_putc(m, '\n'); 943 seq_putc(m, '\n'); 1534 } 944 } 1535 945 1536 /* 946 /* 1537 * This walks the free areas for each zone. 
947 * This walks the free areas for each zone. 1538 */ 948 */ 1539 static int frag_show(struct seq_file *m, void 949 static int frag_show(struct seq_file *m, void *arg) 1540 { 950 { 1541 pg_data_t *pgdat = (pg_data_t *)arg; 951 pg_data_t *pgdat = (pg_data_t *)arg; 1542 walk_zones_in_node(m, pgdat, true, fa !! 952 walk_zones_in_node(m, pgdat, frag_show_print); 1543 return 0; 953 return 0; 1544 } 954 } 1545 955 1546 static void pagetypeinfo_showfree_print(struc 956 static void pagetypeinfo_showfree_print(struct seq_file *m, 1547 pg_da 957 pg_data_t *pgdat, struct zone *zone) 1548 { 958 { 1549 int order, mtype; 959 int order, mtype; 1550 960 1551 for (mtype = 0; mtype < MIGRATE_TYPES 961 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) { 1552 seq_printf(m, "Node %4d, zone 962 seq_printf(m, "Node %4d, zone %8s, type %12s ", 1553 pgdat 963 pgdat->node_id, 1554 zone- 964 zone->name, 1555 migra 965 migratetype_names[mtype]); 1556 for (order = 0; order < NR_PA !! 966 for (order = 0; order < MAX_ORDER; ++order) { 1557 unsigned long freecou 967 unsigned long freecount = 0; 1558 struct free_area *are 968 struct free_area *area; 1559 struct list_head *cur 969 struct list_head *curr; 1560 bool overflow = false << 1561 970 1562 area = &(zone->free_a 971 area = &(zone->free_area[order]); 1563 972 1564 list_for_each(curr, & !! 973 list_for_each(curr, &area->free_list[mtype]) 1565 /* !! 974 freecount++; 1566 * Cap the fr !! 975 seq_printf(m, "%6lu ", freecount); 1567 * be really << 1568 * so a long << 1569 * hard locku << 1570 * debugging << 1571 * of pages o << 1572 * sufficient << 1573 */ << 1574 if (++freecou << 1575 overf << 1576 break << 1577 } << 1578 } << 1579 seq_printf(m, "%s%6lu << 1580 spin_unlock_irq(&zone 976 spin_unlock_irq(&zone->lock); 1581 cond_resched(); 977 cond_resched(); 1582 spin_lock_irq(&zone-> 978 spin_lock_irq(&zone->lock); 1583 } 979 } 1584 seq_putc(m, '\n'); 980 seq_putc(m, '\n'); 1585 } 981 } 1586 } 982 } 1587 983 1588 /* Print out the free pages at each order for 984 /* Print out the free pages at each order for each migatetype */ 1589 static void pagetypeinfo_showfree(struct seq_ !! 985 static int pagetypeinfo_showfree(struct seq_file *m, void *arg) 1590 { 986 { 1591 int order; 987 int order; 1592 pg_data_t *pgdat = (pg_data_t *)arg; 988 pg_data_t *pgdat = (pg_data_t *)arg; 1593 989 1594 /* Print header */ 990 /* Print header */ 1595 seq_printf(m, "%-43s ", "Free pages c 991 seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); 1596 for (order = 0; order < NR_PAGE_ORDER !! 992 for (order = 0; order < MAX_ORDER; ++order) 1597 seq_printf(m, "%6d ", order); 993 seq_printf(m, "%6d ", order); 1598 seq_putc(m, '\n'); 994 seq_putc(m, '\n'); 1599 995 1600 walk_zones_in_node(m, pgdat, true, fa !! 
996 walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); >> 997 >> 998 return 0; 1601 } 999 } 1602 1000 1603 static void pagetypeinfo_showblockcount_print 1001 static void pagetypeinfo_showblockcount_print(struct seq_file *m, 1604 pg_da 1002 pg_data_t *pgdat, struct zone *zone) 1605 { 1003 { 1606 int mtype; 1004 int mtype; 1607 unsigned long pfn; 1005 unsigned long pfn; 1608 unsigned long start_pfn = zone->zone_ 1006 unsigned long start_pfn = zone->zone_start_pfn; 1609 unsigned long end_pfn = zone_end_pfn( 1007 unsigned long end_pfn = zone_end_pfn(zone); 1610 unsigned long count[MIGRATE_TYPES] = 1008 unsigned long count[MIGRATE_TYPES] = { 0, }; 1611 1009 1612 for (pfn = start_pfn; pfn < end_pfn; 1010 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 1613 struct page *page; 1011 struct page *page; 1614 1012 1615 page = pfn_to_online_page(pfn !! 1013 if (!pfn_valid(pfn)) 1616 if (!page) << 1617 continue; 1014 continue; 1618 1015 1619 if (page_zone(page) != zone) !! 1016 page = pfn_to_page(pfn); >> 1017 >> 1018 /* Watch for unexpected holes punched in the memmap */ >> 1019 if (!memmap_valid_within(pfn, page, zone)) 1620 continue; 1020 continue; 1621 1021 1622 mtype = get_pageblock_migrate 1022 mtype = get_pageblock_migratetype(page); 1623 1023 1624 if (mtype < MIGRATE_TYPES) 1024 if (mtype < MIGRATE_TYPES) 1625 count[mtype]++; 1025 count[mtype]++; 1626 } 1026 } 1627 1027 1628 /* Print counts */ 1028 /* Print counts */ 1629 seq_printf(m, "Node %d, zone %8s ", p 1029 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); 1630 for (mtype = 0; mtype < MIGRATE_TYPES 1030 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) 1631 seq_printf(m, "%12lu ", count 1031 seq_printf(m, "%12lu ", count[mtype]); 1632 seq_putc(m, '\n'); 1032 seq_putc(m, '\n'); 1633 } 1033 } 1634 1034 1635 /* Print out the number of pageblocks for eac !! 1035 /* Print out the free pages at each order for each migratetype */ 1636 static void pagetypeinfo_showblockcount(struc !! 1036 static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) 1637 { 1037 { 1638 int mtype; 1038 int mtype; 1639 pg_data_t *pgdat = (pg_data_t *)arg; 1039 pg_data_t *pgdat = (pg_data_t *)arg; 1640 1040 1641 seq_printf(m, "\n%-23s", "Number of b 1041 seq_printf(m, "\n%-23s", "Number of blocks type "); 1642 for (mtype = 0; mtype < MIGRATE_TYPES 1042 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) 1643 seq_printf(m, "%12s ", migrat 1043 seq_printf(m, "%12s ", migratetype_names[mtype]); 1644 seq_putc(m, '\n'); 1044 seq_putc(m, '\n'); 1645 walk_zones_in_node(m, pgdat, true, fa !! 1045 walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); 1646 pagetypeinfo_showblockcount_p !! 1046 >> 1047 return 0; 1647 } 1048 } 1648 1049 >> 1050 #ifdef CONFIG_PAGE_OWNER >> 1051 static void pagetypeinfo_showmixedcount_print(struct seq_file *m, >> 1052 pg_data_t *pgdat, >> 1053 struct zone *zone) >> 1054 { >> 1055 struct page *page; >> 1056 struct page_ext *page_ext; >> 1057 unsigned long pfn = zone->zone_start_pfn, block_end_pfn; >> 1058 unsigned long end_pfn = pfn + zone->spanned_pages; >> 1059 unsigned long count[MIGRATE_TYPES] = { 0, }; >> 1060 int pageblock_mt, page_mt; >> 1061 int i; >> 1062 >> 1063 /* Scan block by block. First and last block may be incomplete */ >> 1064 pfn = zone->zone_start_pfn; >> 1065 >> 1066 /* >> 1067 * Walk the zone in pageblock_nr_pages steps. If a page block spans >> 1068 * a zone boundary, it will be double counted between zones. 
This does >> 1069 * not matter as the mixed block count will still be correct >> 1070 */ >> 1071 for (; pfn < end_pfn; ) { >> 1072 if (!pfn_valid(pfn)) { >> 1073 pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); >> 1074 continue; >> 1075 } >> 1076 >> 1077 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); >> 1078 block_end_pfn = min(block_end_pfn, end_pfn); >> 1079 >> 1080 page = pfn_to_page(pfn); >> 1081 pageblock_mt = get_pfnblock_migratetype(page, pfn); >> 1082 >> 1083 for (; pfn < block_end_pfn; pfn++) { >> 1084 if (!pfn_valid_within(pfn)) >> 1085 continue; >> 1086 >> 1087 page = pfn_to_page(pfn); >> 1088 if (PageBuddy(page)) { >> 1089 pfn += (1UL << page_order(page)) - 1; >> 1090 continue; >> 1091 } >> 1092 >> 1093 if (PageReserved(page)) >> 1094 continue; >> 1095 >> 1096 page_ext = lookup_page_ext(page); >> 1097 if (unlikely(!page_ext)) >> 1098 continue; >> 1099 >> 1100 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) >> 1101 continue; >> 1102 >> 1103 page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); >> 1104 if (pageblock_mt != page_mt) { >> 1105 if (is_migrate_cma(pageblock_mt)) >> 1106 count[MIGRATE_MOVABLE]++; >> 1107 else >> 1108 count[pageblock_mt]++; >> 1109 >> 1110 pfn = block_end_pfn; >> 1111 break; >> 1112 } >> 1113 pfn += (1UL << page_ext->order) - 1; >> 1114 } >> 1115 } >> 1116 >> 1117 /* Print counts */ >> 1118 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); >> 1119 for (i = 0; i < MIGRATE_TYPES; i++) >> 1120 seq_printf(m, "%12lu ", count[i]); >> 1121 seq_putc(m, '\n'); >> 1122 } >> 1123 #endif /* CONFIG_PAGE_OWNER */ >> 1124 1649 /* 1125 /* 1650 * Print out the number of pageblocks for eac 1126 * Print out the number of pageblocks for each migratetype that contain pages 1651 * of other types. This gives an indication o 1127 * of other types. This gives an indication of how well fallbacks are being 1652 * contained by rmqueue_fallback(). It requir 1128 * contained by rmqueue_fallback(). It requires information from PAGE_OWNER 1653 * to determine what is going on 1129 * to determine what is going on 1654 */ 1130 */ 1655 static void pagetypeinfo_showmixedcount(struc 1131 static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) 1656 { 1132 { 1657 #ifdef CONFIG_PAGE_OWNER 1133 #ifdef CONFIG_PAGE_OWNER 1658 int mtype; 1134 int mtype; 1659 1135 1660 if (!static_branch_unlikely(&page_own !! 1136 if (!page_owner_inited) 1661 return; 1137 return; 1662 1138 1663 drain_all_pages(NULL); 1139 drain_all_pages(NULL); 1664 1140 1665 seq_printf(m, "\n%-23s", "Number of m 1141 seq_printf(m, "\n%-23s", "Number of mixed blocks "); 1666 for (mtype = 0; mtype < MIGRATE_TYPES 1142 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) 1667 seq_printf(m, "%12s ", migrat 1143 seq_printf(m, "%12s ", migratetype_names[mtype]); 1668 seq_putc(m, '\n'); 1144 seq_putc(m, '\n'); 1669 1145 1670 walk_zones_in_node(m, pgdat, true, tr !! 1146 walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); 1671 pagetypeinfo_showmixedcount_p << 1672 #endif /* CONFIG_PAGE_OWNER */ 1147 #endif /* CONFIG_PAGE_OWNER */ 1673 } 1148 } 1674 1149 1675 /* 1150 /* 1676 * This prints out statistics in relation to 1151 * This prints out statistics in relation to grouping pages by mobility. 1677 * It is expensive to collect so do not const 1152 * It is expensive to collect so do not constantly read the file. 
1678 */ 1153 */ 1679 static int pagetypeinfo_show(struct seq_file 1154 static int pagetypeinfo_show(struct seq_file *m, void *arg) 1680 { 1155 { 1681 pg_data_t *pgdat = (pg_data_t *)arg; 1156 pg_data_t *pgdat = (pg_data_t *)arg; 1682 1157 1683 /* check memoryless node */ 1158 /* check memoryless node */ 1684 if (!node_state(pgdat->node_id, N_MEM 1159 if (!node_state(pgdat->node_id, N_MEMORY)) 1685 return 0; 1160 return 0; 1686 1161 1687 seq_printf(m, "Page block order: %d\n 1162 seq_printf(m, "Page block order: %d\n", pageblock_order); 1688 seq_printf(m, "Pages per block: %lu\ 1163 seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages); 1689 seq_putc(m, '\n'); 1164 seq_putc(m, '\n'); 1690 pagetypeinfo_showfree(m, pgdat); 1165 pagetypeinfo_showfree(m, pgdat); 1691 pagetypeinfo_showblockcount(m, pgdat) 1166 pagetypeinfo_showblockcount(m, pgdat); 1692 pagetypeinfo_showmixedcount(m, pgdat) 1167 pagetypeinfo_showmixedcount(m, pgdat); 1693 1168 1694 return 0; 1169 return 0; 1695 } 1170 } 1696 1171 1697 static const struct seq_operations fragmentat 1172 static const struct seq_operations fragmentation_op = { 1698 .start = frag_start, 1173 .start = frag_start, 1699 .next = frag_next, 1174 .next = frag_next, 1700 .stop = frag_stop, 1175 .stop = frag_stop, 1701 .show = frag_show, 1176 .show = frag_show, 1702 }; 1177 }; 1703 1178 >> 1179 static int fragmentation_open(struct inode *inode, struct file *file) >> 1180 { >> 1181 return seq_open(file, &fragmentation_op); >> 1182 } >> 1183 >> 1184 static const struct file_operations fragmentation_file_operations = { >> 1185 .open = fragmentation_open, >> 1186 .read = seq_read, >> 1187 .llseek = seq_lseek, >> 1188 .release = seq_release, >> 1189 }; >> 1190 1704 static const struct seq_operations pagetypein 1191 static const struct seq_operations pagetypeinfo_op = { 1705 .start = frag_start, 1192 .start = frag_start, 1706 .next = frag_next, 1193 .next = frag_next, 1707 .stop = frag_stop, 1194 .stop = frag_stop, 1708 .show = pagetypeinfo_show, 1195 .show = pagetypeinfo_show, 1709 }; 1196 }; 1710 1197 1711 static bool is_zone_first_populated(pg_data_t !! 1198 static int pagetypeinfo_open(struct inode *inode, struct file *file) 1712 { 1199 { 1713 int zid; !! 1200 return seq_open(file, &pagetypeinfo_op); 1714 << 1715 for (zid = 0; zid < MAX_NR_ZONES; zid << 1716 struct zone *compare = &pgdat << 1717 << 1718 if (populated_zone(compare)) << 1719 return zone == compar << 1720 } << 1721 << 1722 return false; << 1723 } 1201 } 1724 1202 >> 1203 static const struct file_operations pagetypeinfo_file_ops = { >> 1204 .open = pagetypeinfo_open, >> 1205 .read = seq_read, >> 1206 .llseek = seq_lseek, >> 1207 .release = seq_release, >> 1208 }; >> 1209 1725 static void zoneinfo_show_print(struct seq_fi 1210 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 1726 1211 struct zone *zone) 1727 { 1212 { 1728 int i; 1213 int i; 1729 seq_printf(m, "Node %d, zone %8s", pg 1214 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); 1730 if (is_zone_first_populated(pgdat, zo << 1731 seq_printf(m, "\n per-node s << 1732 for (i = 0; i < NR_VM_NODE_ST << 1733 unsigned long pages = << 1734 << 1735 if (vmstat_item_print << 1736 pages /= HPAG << 1737 seq_printf(m, "\n << 1738 pages); << 1739 } << 1740 } << 1741 seq_printf(m, 1215 seq_printf(m, 1742 "\n pages free %lu" 1216 "\n pages free %lu" 1743 "\n boost %lu" << 1744 "\n min %lu" 1217 "\n min %lu" 1745 "\n low %lu" 1218 "\n low %lu" 1746 "\n high %lu" 1219 "\n high %lu" 1747 "\n promo %lu" !! 
1220 "\n scanned %lu" 1748 "\n spanned %lu" 1221 "\n spanned %lu" 1749 "\n present %lu" 1222 "\n present %lu" 1750 "\n managed %lu" !! 1223 "\n managed %lu", 1751 "\n cma %lu", << 1752 zone_page_state(zone, NR_F 1224 zone_page_state(zone, NR_FREE_PAGES), 1753 zone->watermark_boost, << 1754 min_wmark_pages(zone), 1225 min_wmark_pages(zone), 1755 low_wmark_pages(zone), 1226 low_wmark_pages(zone), 1756 high_wmark_pages(zone), 1227 high_wmark_pages(zone), 1757 promo_wmark_pages(zone), !! 1228 zone_page_state(zone, NR_PAGES_SCANNED), 1758 zone->spanned_pages, 1229 zone->spanned_pages, 1759 zone->present_pages, 1230 zone->present_pages, 1760 zone_managed_pages(zone), !! 1231 zone->managed_pages); 1761 zone_cma_pages(zone)); !! 1232 >> 1233 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) >> 1234 seq_printf(m, "\n %-12s %lu", vmstat_text[i], >> 1235 zone_page_state(zone, i)); 1762 1236 1763 seq_printf(m, 1237 seq_printf(m, 1764 "\n protection: (%l 1238 "\n protection: (%ld", 1765 zone->lowmem_reserve[0]); 1239 zone->lowmem_reserve[0]); 1766 for (i = 1; i < ARRAY_SIZE(zone->lowm 1240 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) 1767 seq_printf(m, ", %ld", zone-> 1241 seq_printf(m, ", %ld", zone->lowmem_reserve[i]); 1768 seq_putc(m, ')'); !! 1242 seq_printf(m, 1769 !! 1243 ")" 1770 /* If unpopulated, no other informati !! 1244 "\n pagesets"); 1771 if (!populated_zone(zone)) { << 1772 seq_putc(m, '\n'); << 1773 return; << 1774 } << 1775 << 1776 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS << 1777 seq_printf(m, "\n %-12s << 1778 zone_page_state(zo << 1779 << 1780 #ifdef CONFIG_NUMA << 1781 for (i = 0; i < NR_VM_NUMA_EVENT_ITEM << 1782 seq_printf(m, "\n %-12s << 1783 zone_numa_event_st << 1784 #endif << 1785 << 1786 seq_printf(m, "\n pagesets"); << 1787 for_each_online_cpu(i) { 1245 for_each_online_cpu(i) { 1788 struct per_cpu_pages *pcp; !! 1246 struct per_cpu_pageset *pageset; 1789 struct per_cpu_zonestat __may << 1790 1247 1791 pcp = per_cpu_ptr(zone->per_c !! 1248 pageset = per_cpu_ptr(zone->pageset, i); 1792 seq_printf(m, 1249 seq_printf(m, 1793 "\n cpu: %i" 1250 "\n cpu: %i" 1794 "\n c 1251 "\n count: %i" 1795 "\n h 1252 "\n high: %i" 1796 "\n b 1253 "\n batch: %i", 1797 i, 1254 i, 1798 pcp->count, !! 1255 pageset->pcp.count, 1799 pcp->high, !! 1256 pageset->pcp.high, 1800 pcp->batch); !! 1257 pageset->pcp.batch); 1801 #ifdef CONFIG_SMP 1258 #ifdef CONFIG_SMP 1802 pzstats = per_cpu_ptr(zone->p << 1803 seq_printf(m, "\n vm stats t 1259 seq_printf(m, "\n vm stats threshold: %d", 1804 pzstats->stat !! 1260 pageset->stat_threshold); 1805 #endif 1261 #endif 1806 } 1262 } 1807 seq_printf(m, 1263 seq_printf(m, 1808 "\n node_unreclaimable: !! 1264 "\n all_unreclaimable: %u" 1809 "\n start_pfn: !! 1265 "\n start_pfn: %lu" 1810 pgdat->kswapd_failures >= !! 1266 "\n inactive_ratio: %u", 1811 zone->zone_start_pfn); !! 1267 !zone_reclaimable(zone), >> 1268 zone->zone_start_pfn, >> 1269 zone->inactive_ratio); 1812 seq_putc(m, '\n'); 1270 seq_putc(m, '\n'); 1813 } 1271 } 1814 1272 1815 /* 1273 /* 1816 * Output information about zones in @pgdat. !! 1274 * Output information about zones in @pgdat. 
1817 * of whether they are populated or not: lowm << 1818 * set of all zones and userspace would not b << 1819 * suppressed here (zoneinfo displays the eff << 1820 */ 1275 */ 1821 static int zoneinfo_show(struct seq_file *m, 1276 static int zoneinfo_show(struct seq_file *m, void *arg) 1822 { 1277 { 1823 pg_data_t *pgdat = (pg_data_t *)arg; 1278 pg_data_t *pgdat = (pg_data_t *)arg; 1824 walk_zones_in_node(m, pgdat, false, f !! 1279 walk_zones_in_node(m, pgdat, zoneinfo_show_print); 1825 return 0; 1280 return 0; 1826 } 1281 } 1827 1282 1828 static const struct seq_operations zoneinfo_o 1283 static const struct seq_operations zoneinfo_op = { 1829 .start = frag_start, /* iterate over 1284 .start = frag_start, /* iterate over all zones. The same as in 1830 * fragmentatio 1285 * fragmentation. */ 1831 .next = frag_next, 1286 .next = frag_next, 1832 .stop = frag_stop, 1287 .stop = frag_stop, 1833 .show = zoneinfo_show, 1288 .show = zoneinfo_show, 1834 }; 1289 }; 1835 1290 1836 #define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEM !! 1291 static int zoneinfo_open(struct inode *inode, struct file *file) 1837 NR_VM_NUMA_EVENT_ITE !! 1292 { 1838 NR_VM_NODE_STAT_ITEM !! 1293 return seq_open(file, &zoneinfo_op); 1839 NR_VM_STAT_ITEMS + \ !! 1294 } 1840 (IS_ENABLED(CONFIG_V !! 1295 1841 NR_VM_EVENT_ITEMS : !! 1296 static const struct file_operations proc_zoneinfo_file_operations = { >> 1297 .open = zoneinfo_open, >> 1298 .read = seq_read, >> 1299 .llseek = seq_lseek, >> 1300 .release = seq_release, >> 1301 }; >> 1302 >> 1303 enum writeback_stat_item { >> 1304 NR_DIRTY_THRESHOLD, >> 1305 NR_DIRTY_BG_THRESHOLD, >> 1306 NR_VM_WRITEBACK_STAT_ITEMS, >> 1307 }; 1842 1308 1843 static void *vmstat_start(struct seq_file *m, 1309 static void *vmstat_start(struct seq_file *m, loff_t *pos) 1844 { 1310 { 1845 unsigned long *v; 1311 unsigned long *v; 1846 int i; !! 1312 int i, stat_items_size; 1847 1313 1848 if (*pos >= NR_VMSTAT_ITEMS) !! 1314 if (*pos >= ARRAY_SIZE(vmstat_text)) 1849 return NULL; 1315 return NULL; >> 1316 stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + >> 1317 NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); >> 1318 >> 1319 #ifdef CONFIG_VM_EVENT_COUNTERS >> 1320 stat_items_size += sizeof(struct vm_event_state); >> 1321 #endif 1850 1322 1851 BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) !! 1323 v = kmalloc(stat_items_size, GFP_KERNEL); 1852 fold_vm_numa_events(); << 1853 v = kmalloc_array(NR_VMSTAT_ITEMS, si << 1854 m->private = v; 1324 m->private = v; 1855 if (!v) 1325 if (!v) 1856 return ERR_PTR(-ENOMEM); 1326 return ERR_PTR(-ENOMEM); 1857 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS 1327 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1858 v[i] = global_zone_page_state !! 1328 v[i] = global_page_state(i); 1859 v += NR_VM_ZONE_STAT_ITEMS; 1329 v += NR_VM_ZONE_STAT_ITEMS; 1860 1330 1861 #ifdef CONFIG_NUMA << 1862 for (i = 0; i < NR_VM_NUMA_EVENT_ITEM << 1863 v[i] = global_numa_event_stat << 1864 v += NR_VM_NUMA_EVENT_ITEMS; << 1865 #endif << 1866 << 1867 for (i = 0; i < NR_VM_NODE_STAT_ITEMS << 1868 v[i] = global_node_page_state << 1869 if (vmstat_item_print_in_thp( << 1870 v[i] /= HPAGE_PMD_NR; << 1871 } << 1872 v += NR_VM_NODE_STAT_ITEMS; << 1873 << 1874 global_dirty_limits(v + NR_DIRTY_BG_T 1331 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, 1875 v + NR_DIRTY_THRE 1332 v + NR_DIRTY_THRESHOLD); 1876 v[NR_MEMMAP_PAGES] = atomic_long_read !! 
1333 v += NR_VM_WRITEBACK_STAT_ITEMS; 1877 v[NR_MEMMAP_BOOT_PAGES] = atomic_long << 1878 v += NR_VM_STAT_ITEMS; << 1879 1334 1880 #ifdef CONFIG_VM_EVENT_COUNTERS 1335 #ifdef CONFIG_VM_EVENT_COUNTERS 1881 all_vm_events(v); 1336 all_vm_events(v); 1882 v[PGPGIN] /= 2; /* sectors -> 1337 v[PGPGIN] /= 2; /* sectors -> kbytes */ 1883 v[PGPGOUT] /= 2; 1338 v[PGPGOUT] /= 2; 1884 #endif 1339 #endif 1885 return (unsigned long *)m->private + 1340 return (unsigned long *)m->private + *pos; 1886 } 1341 } 1887 1342 1888 static void *vmstat_next(struct seq_file *m, 1343 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 1889 { 1344 { 1890 (*pos)++; 1345 (*pos)++; 1891 if (*pos >= NR_VMSTAT_ITEMS) !! 1346 if (*pos >= ARRAY_SIZE(vmstat_text)) 1892 return NULL; 1347 return NULL; 1893 return (unsigned long *)m->private + 1348 return (unsigned long *)m->private + *pos; 1894 } 1349 } 1895 1350 1896 static int vmstat_show(struct seq_file *m, vo 1351 static int vmstat_show(struct seq_file *m, void *arg) 1897 { 1352 { 1898 unsigned long *l = arg; 1353 unsigned long *l = arg; 1899 unsigned long off = l - (unsigned lon 1354 unsigned long off = l - (unsigned long *)m->private; 1900 1355 1901 seq_puts(m, vmstat_text[off]); 1356 seq_puts(m, vmstat_text[off]); 1902 seq_put_decimal_ull(m, " ", *l); !! 1357 seq_put_decimal_ull(m, ' ', *l); 1903 seq_putc(m, '\n'); 1358 seq_putc(m, '\n'); 1904 << 1905 if (off == NR_VMSTAT_ITEMS - 1) { << 1906 /* << 1907 * We've come to the end - ad << 1908 * breaking userspace which m << 1909 */ << 1910 seq_puts(m, "nr_unstable 0\n" << 1911 } << 1912 return 0; 1359 return 0; 1913 } 1360 } 1914 1361 1915 static void vmstat_stop(struct seq_file *m, v 1362 static void vmstat_stop(struct seq_file *m, void *arg) 1916 { 1363 { 1917 kfree(m->private); 1364 kfree(m->private); 1918 m->private = NULL; 1365 m->private = NULL; 1919 } 1366 } 1920 1367 1921 static const struct seq_operations vmstat_op 1368 static const struct seq_operations vmstat_op = { 1922 .start = vmstat_start, 1369 .start = vmstat_start, 1923 .next = vmstat_next, 1370 .next = vmstat_next, 1924 .stop = vmstat_stop, 1371 .stop = vmstat_stop, 1925 .show = vmstat_show, 1372 .show = vmstat_show, 1926 }; 1373 }; 1927 #endif /* CONFIG_PROC_FS */ << 1928 << 1929 #ifdef CONFIG_SMP << 1930 static DEFINE_PER_CPU(struct delayed_work, vm << 1931 int sysctl_stat_interval __read_mostly = HZ; << 1932 1374 1933 #ifdef CONFIG_PROC_FS !! 1375 static int vmstat_open(struct inode *inode, struct file *file) 1934 static void refresh_vm_stats(struct work_stru << 1935 { 1376 { 1936 refresh_cpu_vm_stats(true); !! 1377 return seq_open(file, &vmstat_op); 1937 } 1378 } 1938 1379 1939 int vmstat_refresh(const struct ctl_table *ta !! 1380 static const struct file_operations proc_vmstat_file_operations = { 1940 void *buffer, size_t *lenp !! 1381 .open = vmstat_open, 1941 { !! 1382 .read = seq_read, 1942 long val; !! 1383 .llseek = seq_lseek, 1943 int err; !! 1384 .release = seq_release, 1944 int i; !! 
1385 }; 1945 << 1946 /* << 1947 * The regular update, every sysctl_s << 1948 * than expected: leaving a significa << 1949 * This is particularly misleading wh << 1950 * pages, immediately after running a << 1951 * which can equally be echo'ed to or << 1952 * can be used to update the stats ju << 1953 * << 1954 * Oh, and since global_zone_page_sta << 1955 * transiently negative values, repor << 1956 * the stats is negative, so we know << 1957 */ << 1958 err = schedule_on_each_cpu(refresh_vm << 1959 if (err) << 1960 return err; << 1961 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS << 1962 /* << 1963 * Skip checking stats known << 1964 */ << 1965 switch (i) { << 1966 case NR_ZONE_WRITE_PENDING: << 1967 case NR_FREE_CMA_PAGES: << 1968 continue; << 1969 } << 1970 val = atomic_long_read(&vm_zo << 1971 if (val < 0) { << 1972 pr_warn("%s: %s %ld\n << 1973 __func__, zon << 1974 } << 1975 } << 1976 for (i = 0; i < NR_VM_NODE_STAT_ITEMS << 1977 /* << 1978 * Skip checking stats known << 1979 */ << 1980 switch (i) { << 1981 case NR_WRITEBACK: << 1982 continue; << 1983 } << 1984 val = atomic_long_read(&vm_no << 1985 if (val < 0) { << 1986 pr_warn("%s: %s %ld\n << 1987 __func__, nod << 1988 } << 1989 } << 1990 if (write) << 1991 *ppos += *lenp; << 1992 else << 1993 *lenp = 0; << 1994 return 0; << 1995 } << 1996 #endif /* CONFIG_PROC_FS */ 1386 #endif /* CONFIG_PROC_FS */ 1997 1387 >> 1388 #ifdef CONFIG_SMP >> 1389 static struct workqueue_struct *vmstat_wq; >> 1390 static DEFINE_PER_CPU(struct delayed_work, vmstat_work); >> 1391 int sysctl_stat_interval __read_mostly = HZ; >> 1392 static cpumask_var_t cpu_stat_off; >> 1393 1998 static void vmstat_update(struct work_struct 1394 static void vmstat_update(struct work_struct *w) 1999 { 1395 { 2000 if (refresh_cpu_vm_stats(true)) { 1396 if (refresh_cpu_vm_stats(true)) { 2001 /* 1397 /* 2002 * Counters were updated so w 1398 * Counters were updated so we expect more updates 2003 * to occur in the future. Ke 1399 * to occur in the future. Keep on running the 2004 * update worker thread. 1400 * update worker thread. >> 1401 * If we were marked on cpu_stat_off clear the flag >> 1402 * so that vmstat_shepherd doesn't schedule us again. 2005 */ 1403 */ 2006 queue_delayed_work_on(smp_pro !! 1404 if (!cpumask_test_and_clear_cpu(smp_processor_id(), >> 1405 cpu_stat_off)) { >> 1406 queue_delayed_work_on(smp_processor_id(), vmstat_wq, 2007 this_cpu_ptr( 1407 this_cpu_ptr(&vmstat_work), 2008 round_jiffies 1408 round_jiffies_relative(sysctl_stat_interval)); >> 1409 } >> 1410 } else { >> 1411 /* >> 1412 * We did not update any counters so the app may be in >> 1413 * a mode where it does not cause counter updates. >> 1414 * We may be uselessly running vmstat_update. >> 1415 * Defer the checking for differentials to the >> 1416 * shepherd thread on a different processor. >> 1417 */ >> 1418 cpumask_set_cpu(smp_processor_id(), cpu_stat_off); 2009 } 1419 } 2010 } 1420 } 2011 1421 2012 /* 1422 /* >> 1423 * Switch off vmstat processing and then fold all the remaining differentials >> 1424 * until the diffs stay at zero. The function is used by NOHZ and can only be >> 1425 * invoked when tick processing is not active. >> 1426 */ >> 1427 /* 2013 * Check if the diffs for a certain cpu indic 1428 * Check if the diffs for a certain cpu indicate that 2014 * an update is needed. 1429 * an update is needed. 
2015 */ 1430 */ 2016 static bool need_update(int cpu) 1431 static bool need_update(int cpu) 2017 { 1432 { 2018 pg_data_t *last_pgdat = NULL; << 2019 struct zone *zone; 1433 struct zone *zone; 2020 1434 2021 for_each_populated_zone(zone) { 1435 for_each_populated_zone(zone) { 2022 struct per_cpu_zonestat *pzst !! 1436 struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); 2023 struct per_cpu_nodestat *n; << 2024 1437 >> 1438 BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); 2025 /* 1439 /* 2026 * The fast way of checking i 1440 * The fast way of checking if there are any vmstat diffs. >> 1441 * This works because the diffs are byte sized items. 2027 */ 1442 */ 2028 if (memchr_inv(pzstats->vm_st !! 1443 if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) 2029 return true; 1444 return true; 2030 1445 2031 if (last_pgdat == zone->zone_ << 2032 continue; << 2033 last_pgdat = zone->zone_pgdat << 2034 n = per_cpu_ptr(zone->zone_pg << 2035 if (memchr_inv(n->vm_node_sta << 2036 return true; << 2037 } 1446 } 2038 return false; 1447 return false; 2039 } 1448 } 2040 1449 2041 /* << 2042 * Switch off vmstat processing and then fold << 2043 * until the diffs stay at zero. The function << 2044 * invoked when tick processing is not active << 2045 */ << 2046 void quiet_vmstat(void) 1450 void quiet_vmstat(void) 2047 { 1451 { 2048 if (system_state != SYSTEM_RUNNING) 1452 if (system_state != SYSTEM_RUNNING) 2049 return; 1453 return; 2050 1454 2051 if (!delayed_work_pending(this_cpu_pt !! 1455 /* >> 1456 * If we are already in hands of the shepherd then there >> 1457 * is nothing for us to do here. >> 1458 */ >> 1459 if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) 2052 return; 1460 return; 2053 1461 2054 if (!need_update(smp_processor_id())) 1462 if (!need_update(smp_processor_id())) 2055 return; 1463 return; 2056 1464 2057 /* 1465 /* 2058 * Just refresh counters and do not c 1466 * Just refresh counters and do not care about the pending delayed 2059 * vmstat_update. It doesn't fire tha 1467 * vmstat_update. It doesn't fire that often to matter and canceling 2060 * it would be too expensive from thi 1468 * it would be too expensive from this path. 2061 * vmstat_shepherd will take care abo 1469 * vmstat_shepherd will take care about that for us. 2062 */ 1470 */ 2063 refresh_cpu_vm_stats(false); 1471 refresh_cpu_vm_stats(false); 2064 } 1472 } 2065 1473 >> 1474 2066 /* 1475 /* 2067 * Shepherd worker thread that checks the 1476 * Shepherd worker thread that checks the 2068 * differentials of processors that have thei 1477 * differentials of processors that have their worker 2069 * threads for vm statistics updates disabled 1478 * threads for vm statistics updates disabled because of 2070 * inactivity. 1479 * inactivity. 2071 */ 1480 */ 2072 static void vmstat_shepherd(struct work_struc 1481 static void vmstat_shepherd(struct work_struct *w); 2073 1482 2074 static DECLARE_DEFERRABLE_WORK(shepherd, vmst 1483 static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd); 2075 1484 2076 static void vmstat_shepherd(struct work_struc 1485 static void vmstat_shepherd(struct work_struct *w) 2077 { 1486 { 2078 int cpu; 1487 int cpu; 2079 1488 2080 cpus_read_lock(); !! 1489 get_online_cpus(); 2081 /* Check processors whose vmstat work 1490 /* Check processors whose vmstat worker threads have been disabled */ 2082 for_each_online_cpu(cpu) { !! 1491 for_each_cpu(cpu, cpu_stat_off) { 2083 struct delayed_work *dw = &pe 1492 struct delayed_work *dw = &per_cpu(vmstat_work, cpu); 2084 1493 2085 /* !! 
1494 if (need_update(cpu)) { 2086 * In kernel users of vmstat !! 1495 if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) 2087 * they are using zone_page_s !! 1496 queue_delayed_work_on(cpu, vmstat_wq, dw, 0); 2088 * an imprecision as the regu !! 1497 } else { 2089 * cumulative error can grow !! 1498 /* 2090 * !! 1499 * Cancel the work if quiet_vmstat has put this 2091 * From that POV the regular !! 1500 * cpu on cpu_stat_off because the work item might 2092 * been isolated from the ker !! 1501 * be still scheduled 2093 * infrastructure ever notici !! 1502 */ 2094 * for all isolated CPUs to a !! 1503 cancel_delayed_work(dw); 2095 */ !! 1504 } 2096 if (cpu_is_isolated(cpu)) << 2097 continue; << 2098 << 2099 if (!delayed_work_pending(dw) << 2100 queue_delayed_work_on << 2101 << 2102 cond_resched(); << 2103 } 1505 } 2104 cpus_read_unlock(); !! 1506 put_online_cpus(); 2105 1507 2106 schedule_delayed_work(&shepherd, 1508 schedule_delayed_work(&shepherd, 2107 round_jiffies_relative(sysctl 1509 round_jiffies_relative(sysctl_stat_interval)); 2108 } 1510 } 2109 1511 2110 static void __init start_shepherd_timer(void) 1512 static void __init start_shepherd_timer(void) 2111 { 1513 { 2112 int cpu; 1514 int cpu; 2113 1515 2114 for_each_possible_cpu(cpu) 1516 for_each_possible_cpu(cpu) 2115 INIT_DEFERRABLE_WORK(per_cpu_ !! 1517 INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu), 2116 vmstat_update); 1518 vmstat_update); 2117 1519 >> 1520 if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) >> 1521 BUG(); >> 1522 cpumask_copy(cpu_stat_off, cpu_online_mask); >> 1523 >> 1524 vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); 2118 schedule_delayed_work(&shepherd, 1525 schedule_delayed_work(&shepherd, 2119 round_jiffies_relative(sysctl 1526 round_jiffies_relative(sysctl_stat_interval)); 2120 } 1527 } 2121 1528 2122 static void __init init_cpu_node_state(void) !! 1529 static void vmstat_cpu_dead(int node) 2123 { 1530 { 2124 int node; !! 1531 int cpu; 2125 1532 2126 for_each_online_node(node) { !! 1533 get_online_cpus(); 2127 if (!cpumask_empty(cpumask_of !! 1534 for_each_online_cpu(cpu) 2128 node_set_state(node, !! 1535 if (cpu_to_node(cpu) == node) 2129 } !! 1536 goto end; >> 1537 >> 1538 node_clear_state(node, N_CPU); >> 1539 end: >> 1540 put_online_cpus(); 2130 } 1541 } 2131 1542 2132 static int vmstat_cpu_online(unsigned int cpu !! 1543 /* >> 1544 * Use the cpu notifier to insure that the thresholds are recalculated >> 1545 * when necessary. >> 1546 */ >> 1547 static int vmstat_cpuup_callback(struct notifier_block *nfb, >> 1548 unsigned long action, >> 1549 void *hcpu) 2133 { 1550 { 2134 refresh_zone_stat_thresholds(); !! 1551 long cpu = (long)hcpu; 2135 1552 2136 if (!node_state(cpu_to_node(cpu), N_C !! 1553 switch (action) { >> 1554 case CPU_ONLINE: >> 1555 case CPU_ONLINE_FROZEN: >> 1556 refresh_zone_stat_thresholds(); 2137 node_set_state(cpu_to_node(cp 1557 node_set_state(cpu_to_node(cpu), N_CPU); >> 1558 cpumask_set_cpu(cpu, cpu_stat_off); >> 1559 break; >> 1560 case CPU_DOWN_PREPARE: >> 1561 case CPU_DOWN_PREPARE_FROZEN: >> 1562 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); >> 1563 cpumask_clear_cpu(cpu, cpu_stat_off); >> 1564 break; >> 1565 case CPU_DOWN_FAILED: >> 1566 case CPU_DOWN_FAILED_FROZEN: >> 1567 cpumask_set_cpu(cpu, cpu_stat_off); >> 1568 break; >> 1569 case CPU_DEAD: >> 1570 case CPU_DEAD_FROZEN: >> 1571 refresh_zone_stat_thresholds(); >> 1572 vmstat_cpu_dead(cpu_to_node(cpu)); >> 1573 break; >> 1574 default: >> 1575 break; 2138 } 1576 } 2139 !! 
1577 return NOTIFY_OK; 2140 return 0; << 2141 } << 2142 << 2143 static int vmstat_cpu_down_prep(unsigned int << 2144 { << 2145 cancel_delayed_work_sync(&per_cpu(vms << 2146 return 0; << 2147 } << 2148 << 2149 static int vmstat_cpu_dead(unsigned int cpu) << 2150 { << 2151 const struct cpumask *node_cpus; << 2152 int node; << 2153 << 2154 node = cpu_to_node(cpu); << 2155 << 2156 refresh_zone_stat_thresholds(); << 2157 node_cpus = cpumask_of_node(node); << 2158 if (!cpumask_empty(node_cpus)) << 2159 return 0; << 2160 << 2161 node_clear_state(node, N_CPU); << 2162 << 2163 return 0; << 2164 } 1578 } 2165 1579 >> 1580 static struct notifier_block vmstat_notifier = >> 1581 { &vmstat_cpuup_callback, NULL, 0 }; 2166 #endif 1582 #endif 2167 1583 2168 struct workqueue_struct *mm_percpu_wq; !! 1584 static int __init setup_vmstat(void) 2169 << 2170 void __init init_mm_internals(void) << 2171 { 1585 { 2172 int ret __maybe_unused; << 2173 << 2174 mm_percpu_wq = alloc_workqueue("mm_pe << 2175 << 2176 #ifdef CONFIG_SMP 1586 #ifdef CONFIG_SMP 2177 ret = cpuhp_setup_state_nocalls(CPUHP !! 1587 cpu_notifier_register_begin(); 2178 NULL, !! 1588 __register_cpu_notifier(&vmstat_notifier); 2179 if (ret < 0) << 2180 pr_err("vmstat: failed to reg << 2181 << 2182 ret = cpuhp_setup_state_nocalls(CPUHP << 2183 vmsta << 2184 vmsta << 2185 if (ret < 0) << 2186 pr_err("vmstat: failed to reg << 2187 << 2188 cpus_read_lock(); << 2189 init_cpu_node_state(); << 2190 cpus_read_unlock(); << 2191 1589 2192 start_shepherd_timer(); 1590 start_shepherd_timer(); >> 1591 cpu_notifier_register_done(); 2193 #endif 1592 #endif 2194 #ifdef CONFIG_PROC_FS 1593 #ifdef CONFIG_PROC_FS 2195 proc_create_seq("buddyinfo", 0444, NU !! 1594 proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); 2196 proc_create_seq("pagetypeinfo", 0400, !! 1595 proc_create("pagetypeinfo", 0400, NULL, &pagetypeinfo_file_ops); 2197 proc_create_seq("vmstat", 0444, NULL, !! 1596 proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); 2198 proc_create_seq("zoneinfo", 0444, NUL !! 1597 proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); 2199 #endif 1598 #endif >> 1599 return 0; 2200 } 1600 } >> 1601 module_init(setup_vmstat) 2201 1602 2202 #if defined(CONFIG_DEBUG_FS) && defined(CONFI 1603 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) 2203 1604 2204 /* 1605 /* 2205 * Return an index indicating how much of the 1606 * Return an index indicating how much of the available free memory is 2206 * unusable for an allocation of the requeste 1607 * unusable for an allocation of the requested size. 2207 */ 1608 */ 2208 static int unusable_free_index(unsigned int o 1609 static int unusable_free_index(unsigned int order, 2209 struct contig 1610 struct contig_page_info *info) 2210 { 1611 { 2211 /* No free memory is interpreted as a 1612 /* No free memory is interpreted as all free memory is unusable */ 2212 if (info->free_pages == 0) 1613 if (info->free_pages == 0) 2213 return 1000; 1614 return 1000; 2214 1615 2215 /* 1616 /* 2216 * Index should be a value between 0 1617 * Index should be a value between 0 and 1. Return a value to 3 2217 * decimal places. 1618 * decimal places. 
2218 * 1619 * 2219 * 0 => no fragmentation 1620 * 0 => no fragmentation 2220 * 1 => high fragmentation 1621 * 1 => high fragmentation 2221 */ 1622 */ 2222 return div_u64((info->free_pages - (i 1623 return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages); 2223 1624 2224 } 1625 } 2225 1626 2226 static void unusable_show_print(struct seq_fi 1627 static void unusable_show_print(struct seq_file *m, 2227 pg_da 1628 pg_data_t *pgdat, struct zone *zone) 2228 { 1629 { 2229 unsigned int order; 1630 unsigned int order; 2230 int index; 1631 int index; 2231 struct contig_page_info info; 1632 struct contig_page_info info; 2232 1633 2233 seq_printf(m, "Node %d, zone %8s ", 1634 seq_printf(m, "Node %d, zone %8s ", 2234 pgdat->node_i 1635 pgdat->node_id, 2235 zone->name); 1636 zone->name); 2236 for (order = 0; order < NR_PAGE_ORDER !! 1637 for (order = 0; order < MAX_ORDER; ++order) { 2237 fill_contig_page_info(zone, o 1638 fill_contig_page_info(zone, order, &info); 2238 index = unusable_free_index(o 1639 index = unusable_free_index(order, &info); 2239 seq_printf(m, "%d.%03d ", ind 1640 seq_printf(m, "%d.%03d ", index / 1000, index % 1000); 2240 } 1641 } 2241 1642 2242 seq_putc(m, '\n'); 1643 seq_putc(m, '\n'); 2243 } 1644 } 2244 1645 2245 /* 1646 /* 2246 * Display unusable free space index 1647 * Display unusable free space index 2247 * 1648 * 2248 * The unusable free space index measures how 1649 * The unusable free space index measures how much of the available free 2249 * memory cannot be used to satisfy an alloca 1650 * memory cannot be used to satisfy an allocation of a given size and is a 2250 * value between 0 and 1. The higher the valu 1651 * value between 0 and 1. The higher the value, the more of free memory is 2251 * unusable and by implication, the worse the 1652 * unusable and by implication, the worse the external fragmentation is. This 2252 * can be expressed as a percentage by multip 1653 * can be expressed as a percentage by multiplying by 100. 2253 */ 1654 */ 2254 static int unusable_show(struct seq_file *m, 1655 static int unusable_show(struct seq_file *m, void *arg) 2255 { 1656 { 2256 pg_data_t *pgdat = (pg_data_t *)arg; 1657 pg_data_t *pgdat = (pg_data_t *)arg; 2257 1658 2258 /* check memoryless node */ 1659 /* check memoryless node */ 2259 if (!node_state(pgdat->node_id, N_MEM 1660 if (!node_state(pgdat->node_id, N_MEMORY)) 2260 return 0; 1661 return 0; 2261 1662 2262 walk_zones_in_node(m, pgdat, true, fa !! 1663 walk_zones_in_node(m, pgdat, unusable_show_print); 2263 1664 2264 return 0; 1665 return 0; 2265 } 1666 } 2266 1667 2267 static const struct seq_operations unusable_s !! 1668 static const struct seq_operations unusable_op = { 2268 .start = frag_start, 1669 .start = frag_start, 2269 .next = frag_next, 1670 .next = frag_next, 2270 .stop = frag_stop, 1671 .stop = frag_stop, 2271 .show = unusable_show, 1672 .show = unusable_show, 2272 }; 1673 }; 2273 1674 2274 DEFINE_SEQ_ATTRIBUTE(unusable); !! 
1675 static int unusable_open(struct inode *inode, struct file *file) >> 1676 { >> 1677 return seq_open(file, &unusable_op); >> 1678 } >> 1679 >> 1680 static const struct file_operations unusable_file_ops = { >> 1681 .open = unusable_open, >> 1682 .read = seq_read, >> 1683 .llseek = seq_lseek, >> 1684 .release = seq_release, >> 1685 }; 2275 1686 2276 static void extfrag_show_print(struct seq_fil 1687 static void extfrag_show_print(struct seq_file *m, 2277 pg_da 1688 pg_data_t *pgdat, struct zone *zone) 2278 { 1689 { 2279 unsigned int order; 1690 unsigned int order; 2280 int index; 1691 int index; 2281 1692 2282 /* Alloc on stack as interrupts are d 1693 /* Alloc on stack as interrupts are disabled for zone walk */ 2283 struct contig_page_info info; 1694 struct contig_page_info info; 2284 1695 2285 seq_printf(m, "Node %d, zone %8s ", 1696 seq_printf(m, "Node %d, zone %8s ", 2286 pgdat->node_i 1697 pgdat->node_id, 2287 zone->name); 1698 zone->name); 2288 for (order = 0; order < NR_PAGE_ORDER !! 1699 for (order = 0; order < MAX_ORDER; ++order) { 2289 fill_contig_page_info(zone, o 1700 fill_contig_page_info(zone, order, &info); 2290 index = __fragmentation_index 1701 index = __fragmentation_index(order, &info); 2291 seq_printf(m, "%2d.%03d ", in !! 1702 seq_printf(m, "%d.%03d ", index / 1000, index % 1000); 2292 } 1703 } 2293 1704 2294 seq_putc(m, '\n'); 1705 seq_putc(m, '\n'); 2295 } 1706 } 2296 1707 2297 /* 1708 /* 2298 * Display fragmentation index for orders tha 1709 * Display fragmentation index for orders that allocations would fail for 2299 */ 1710 */ 2300 static int extfrag_show(struct seq_file *m, v 1711 static int extfrag_show(struct seq_file *m, void *arg) 2301 { 1712 { 2302 pg_data_t *pgdat = (pg_data_t *)arg; 1713 pg_data_t *pgdat = (pg_data_t *)arg; 2303 1714 2304 walk_zones_in_node(m, pgdat, true, fa !! 1715 walk_zones_in_node(m, pgdat, extfrag_show_print); 2305 1716 2306 return 0; 1717 return 0; 2307 } 1718 } 2308 1719 2309 static const struct seq_operations extfrag_so !! 1720 static const struct seq_operations extfrag_op = { 2310 .start = frag_start, 1721 .start = frag_start, 2311 .next = frag_next, 1722 .next = frag_next, 2312 .stop = frag_stop, 1723 .stop = frag_stop, 2313 .show = extfrag_show, 1724 .show = extfrag_show, 2314 }; 1725 }; 2315 1726 2316 DEFINE_SEQ_ATTRIBUTE(extfrag); !! 1727 static int extfrag_open(struct inode *inode, struct file *file) >> 1728 { >> 1729 return seq_open(file, &extfrag_op); >> 1730 } >> 1731 >> 1732 static const struct file_operations extfrag_file_ops = { >> 1733 .open = extfrag_open, >> 1734 .read = seq_read, >> 1735 .llseek = seq_lseek, >> 1736 .release = seq_release, >> 1737 }; 2317 1738 2318 static int __init extfrag_debug_init(void) 1739 static int __init extfrag_debug_init(void) 2319 { 1740 { 2320 struct dentry *extfrag_debug_root; 1741 struct dentry *extfrag_debug_root; 2321 1742 2322 extfrag_debug_root = debugfs_create_d 1743 extfrag_debug_root = debugfs_create_dir("extfrag", NULL); >> 1744 if (!extfrag_debug_root) >> 1745 return -ENOMEM; 2323 1746 2324 debugfs_create_file("unusable_index", !! 1747 if (!debugfs_create_file("unusable_index", 0444, 2325 &unusable_fops); !! 1748 extfrag_debug_root, NULL, &unusable_file_ops)) 2326 !! 1749 goto fail; 2327 debugfs_create_file("extfrag_index", !! 1750 2328 &extfrag_fops); !! 
1751 if (!debugfs_create_file("extfrag_index", 0444, >> 1752 extfrag_debug_root, NULL, &extfrag_file_ops)) >> 1753 goto fail; 2329 1754 2330 return 0; 1755 return 0; >> 1756 fail: >> 1757 debugfs_remove_recursive(extfrag_debug_root); >> 1758 return -ENOMEM; 2331 } 1759 } 2332 1760 2333 module_init(extfrag_debug_init); 1761 module_init(extfrag_debug_init); 2334 << 2335 #endif 1762 #endif 2336 1763
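
The fragmentation math used by unusable_free_index() above is easy to check by hand. What follows is a minimal, self-contained userspace sketch of that calculation with hypothetical zone counts; it only mirrors the two contig_page_info fields the function reads and is not part of the kernel source:

#include <stdio.h>

/* Mirrors only the fields unusable_free_index() consults. */
struct contig_page_info {
	unsigned long free_pages;            /* total free pages in the zone        */
	unsigned long free_blocks_suitable;  /* free 2^order chunks available       */
};

/* Same formula as in the listing: result is in thousandths,
 * 0 => no fragmentation at this order, 1000 => all free memory unusable. */
static int unusable_free_index(unsigned int order,
			       const struct contig_page_info *info)
{
	/* No free memory is interpreted as all free memory being unusable */
	if (info->free_pages == 0)
		return 1000;

	return ((info->free_pages -
		 (info->free_blocks_suitable << order)) * 1000UL) /
		info->free_pages;
}

int main(void)
{
	/* Hypothetical zone: 1024 free pages, 3 free blocks of order 4. */
	struct contig_page_info info = {
		.free_pages = 1024,
		.free_blocks_suitable = 3,
	};
	int index = unusable_free_index(4, &info);

	/* (1024 - 3*16) * 1000 / 1024 = 953, printed as 0.953 just like
	 * unusable_show_print() formats it. */
	printf("unusable index at order 4: %d.%03d\n",
	       index / 1000, index % 1000);
	return 0;
}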
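
For reference, vmstat_show() above emits one "name value" pair per line of /proc/vmstat. A small sketch of a userspace reader, assuming nothing beyond that layout:

#include <stdio.h>

int main(void)
{
	char name[128];
	unsigned long value;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}

	/* Each line is "<counter name> <decimal value>". */
	while (fscanf(f, "%127s %lu", name, &value) == 2)
		printf("%-32s %lu\n", name, value);

	fclose(f);
	return 0;
}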