1 /* 2 * 3 * Copyright IBM Corporation, 2012 4 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> 5 * 6 * Cgroup v2 7 * Copyright (C) 2019 Red Hat, Inc. 8 * Author: Giuseppe Scrivano <gscrivan@redhat.com> 9 * 10 * This program is free software; you can redistribute it and/or modify it 11 * under the terms of version 2.1 of the GNU Lesser General Public License 12 * as published by the Free Software Foundation. 13 * 14 * This program is distributed in the hope that it would be useful, but 15 * WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 17 * 18 */ 19 20 #include <linux/cgroup.h> 21 #include <linux/page_counter.h> 22 #include <linux/slab.h> 23 #include <linux/hugetlb.h> 24 #include <linux/hugetlb_cgroup.h> 25 26 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 27 #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) 28 #define MEMFILE_ATTR(val) ((val) & 0xffff) 29 30 /* Use t->m[0] to encode the offset */ 31 #define MEMFILE_OFFSET(t, m0) (((offsetof(t, m0) << 16) | sizeof_field(t, m0))) 32 #define MEMFILE_OFFSET0(val) (((val) >> 16) & 0xffff) 33 #define MEMFILE_FIELD_SIZE(val) ((val) & 0xffff) 34 35 #define DFL_TMPL_SIZE ARRAY_SIZE(hugetlb_dfl_tmpl) 36 #define LEGACY_TMPL_SIZE ARRAY_SIZE(hugetlb_legacy_tmpl) 37 38 static struct hugetlb_cgroup *root_h_cgroup __read_mostly; 39 static struct cftype *dfl_files; 40 static struct cftype *legacy_files; 41 42 static inline struct page_counter * 43 __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, 44 bool rsvd) 45 { 46 if (rsvd) 47 return &h_cg->rsvd_hugepage[idx]; 48 return &h_cg->hugepage[idx]; 49 } 50 51 static inline struct page_counter * 52 hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) 53 { 54 return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false); 55 } 56 57 static inline struct page_counter * 58 hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) 59 { 60 return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true); 61 } 62 63 static inline 64 struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) 65 { 66 return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; 67 } 68 69 static inline 70 struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) 71 { 72 return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); 73 } 74 75 static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) 76 { 77 return (h_cg == root_h_cgroup); 78 } 79 80 static inline struct hugetlb_cgroup * 81 parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) 82 { 83 return hugetlb_cgroup_from_css(h_cg->css.parent); 84 } 85 86 static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) 87 { 88 struct hstate *h; 89 90 for_each_hstate(h) { 91 if (page_counter_read( 92 hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h)))) 93 return true; 94 } 95 return false; 96 } 97 98 static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, 99 struct hugetlb_cgroup *parent_h_cgroup) 100 { 101 int idx; 102 103 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { 104 struct page_counter *fault_parent = NULL; 105 struct page_counter *rsvd_parent = NULL; 106 unsigned long limit; 107 int ret; 108 109 if (parent_h_cgroup) { 110 fault_parent = hugetlb_cgroup_counter_from_cgroup( 111 parent_h_cgroup, idx); 112 rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( 113 parent_h_cgroup, idx); 114 } 115 page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup, 116 idx), 117 fault_parent); 118 page_counter_init( 119 hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), 120 rsvd_parent); 121 122 limit = round_down(PAGE_COUNTER_MAX, 123 pages_per_huge_page(&hstates[idx])); 124 125 ret = page_counter_set_max( 126 hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), 127 limit); 128 VM_BUG_ON(ret); 129 ret = page_counter_set_max( 130 hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), 131 limit); 132 VM_BUG_ON(ret); 133 } 134 } 135 136 static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) 137 { 138 int node; 139 140 for_each_node(node) 141 kfree(h_cgroup->nodeinfo[node]); 142 kfree(h_cgroup); 143 } 144 145 static struct cgroup_subsys_state * 146 hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 147 { 148 struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); 149 struct hugetlb_cgroup *h_cgroup; 150 int node; 151 152 h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), 153 GFP_KERNEL); 154 155 if (!h_cgroup) 156 return ERR_PTR(-ENOMEM); 157 158 if (!parent_h_cgroup) 159 root_h_cgroup = h_cgroup; 160 161 /* 162 * TODO: this routine can waste much memory for nodes which will 163 * never be onlined. It's better to use memory hotplug callback 164 * function. 165 */ 166 for_each_node(node) { 167 /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ 168 int node_to_alloc = 169 node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; 170 h_cgroup->nodeinfo[node] = 171 kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), 172 GFP_KERNEL, node_to_alloc); 173 if (!h_cgroup->nodeinfo[node]) 174 goto fail_alloc_nodeinfo; 175 } 176 177 hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); 178 return &h_cgroup->css; 179 180 fail_alloc_nodeinfo: 181 hugetlb_cgroup_free(h_cgroup); 182 return ERR_PTR(-ENOMEM); 183 } 184 185 static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) 186 { 187 hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); 188 } 189 190 /* 191 * Should be called with hugetlb_lock held. 192 * Since we are holding hugetlb_lock, pages cannot get moved from 193 * active list or uncharged from the cgroup, So no need to get 194 * page reference and test for page active here. This function 195 * cannot fail. 196 */ 197 static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, 198 struct page *page) 199 { 200 unsigned int nr_pages; 201 struct page_counter *counter; 202 struct hugetlb_cgroup *page_hcg; 203 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); 204 struct folio *folio = page_folio(page); 205 206 page_hcg = hugetlb_cgroup_from_folio(folio); 207 /* 208 * We can have pages in active list without any cgroup 209 * ie, hugepage with less than 3 pages. We can safely 210 * ignore those pages. 211 */ 212 if (!page_hcg || page_hcg != h_cg) 213 goto out; 214 215 nr_pages = compound_nr(page); 216 if (!parent) { 217 parent = root_h_cgroup; 218 /* root has no limit */ 219 page_counter_charge(&parent->hugepage[idx], nr_pages); 220 } 221 counter = &h_cg->hugepage[idx]; 222 /* Take the pages off the local counter */ 223 page_counter_cancel(counter, nr_pages); 224 225 set_hugetlb_cgroup(folio, parent); 226 out: 227 return; 228 } 229 230 /* 231 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to 232 * the parent cgroup. 233 */ 234 static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) 235 { 236 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 237 struct hstate *h; 238 struct page *page; 239 240 do { 241 for_each_hstate(h) { 242 spin_lock_irq(&hugetlb_lock); 243 list_for_each_entry(page, &h->hugepage_activelist, lru) 244 hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page); 245 246 spin_unlock_irq(&hugetlb_lock); 247 } 248 cond_resched(); 249 } while (hugetlb_cgroup_have_usage(h_cg)); 250 } 251 252 static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, 253 enum hugetlb_memory_event event) 254 { 255 atomic_long_inc(&hugetlb->events_local[idx][event]); 256 cgroup_file_notify(&hugetlb->events_local_file[idx]); 257 258 do { 259 atomic_long_inc(&hugetlb->events[idx][event]); 260 cgroup_file_notify(&hugetlb->events_file[idx]); 261 } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && 262 !hugetlb_cgroup_is_root(hugetlb)); 263 } 264 265 static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 266 struct hugetlb_cgroup **ptr, 267 bool rsvd) 268 { 269 int ret = 0; 270 struct page_counter *counter; 271 struct hugetlb_cgroup *h_cg = NULL; 272 273 if (hugetlb_cgroup_disabled()) 274 goto done; 275 again: 276 rcu_read_lock(); 277 h_cg = hugetlb_cgroup_from_task(current); 278 if (!css_tryget(&h_cg->css)) { 279 rcu_read_unlock(); 280 goto again; 281 } 282 rcu_read_unlock(); 283 284 if (!page_counter_try_charge( 285 __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), 286 nr_pages, &counter)) { 287 ret = -ENOMEM; 288 hugetlb_event(h_cg, idx, HUGETLB_MAX); 289 css_put(&h_cg->css); 290 goto done; 291 } 292 /* Reservations take a reference to the css because they do not get 293 * reparented. 294 */ 295 if (!rsvd) 296 css_put(&h_cg->css); 297 done: 298 *ptr = h_cg; 299 return ret; 300 } 301 302 int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, 303 struct hugetlb_cgroup **ptr) 304 { 305 return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false); 306 } 307 308 int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, 309 struct hugetlb_cgroup **ptr) 310 { 311 return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true); 312 } 313 314 /* Should be called with hugetlb_lock held */ 315 static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, 316 struct hugetlb_cgroup *h_cg, 317 struct folio *folio, bool rsvd) 318 { 319 if (hugetlb_cgroup_disabled() || !h_cg) 320 return; 321 lockdep_assert_held(&hugetlb_lock); 322 __set_hugetlb_cgroup(folio, h_cg, rsvd); 323 if (!rsvd) { 324 unsigned long usage = 325 h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; 326 /* 327 * This write is not atomic due to fetching usage and writing 328 * to it, but that's fine because we call this with 329 * hugetlb_lock held anyway. 330 */ 331 WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], 332 usage + nr_pages); 333 } 334 } 335 336 void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, 337 struct hugetlb_cgroup *h_cg, 338 struct folio *folio) 339 { 340 __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); 341 } 342 343 void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, 344 struct hugetlb_cgroup *h_cg, 345 struct folio *folio) 346 { 347 __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); 348 } 349 350 /* 351 * Should be called with hugetlb_lock held 352 */ 353 static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, 354 struct folio *folio, bool rsvd) 355 { 356 struct hugetlb_cgroup *h_cg; 357 358 if (hugetlb_cgroup_disabled()) 359 return; 360 lockdep_assert_held(&hugetlb_lock); 361 h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); 362 if (unlikely(!h_cg)) 363 return; 364 __set_hugetlb_cgroup(folio, NULL, rsvd); 365 366 page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, 367 rsvd), 368 nr_pages); 369 370 if (rsvd) 371 css_put(&h_cg->css); 372 else { 373 unsigned long usage = 374 h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; 375 /* 376 * This write is not atomic due to fetching usage and writing 377 * to it, but that's fine because we call this with 378 * hugetlb_lock held anyway. 379 */ 380 WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], 381 usage - nr_pages); 382 } 383 } 384 385 void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, 386 struct folio *folio) 387 { 388 __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); 389 } 390 391 void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, 392 struct folio *folio) 393 { 394 __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); 395 } 396 397 static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, 398 struct hugetlb_cgroup *h_cg, 399 bool rsvd) 400 { 401 if (hugetlb_cgroup_disabled() || !h_cg) 402 return; 403 404 page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, 405 rsvd), 406 nr_pages); 407 408 if (rsvd) 409 css_put(&h_cg->css); 410 } 411 412 void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, 413 struct hugetlb_cgroup *h_cg) 414 { 415 __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false); 416 } 417 418 void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, 419 struct hugetlb_cgroup *h_cg) 420 { 421 __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true); 422 } 423 424 void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, 425 unsigned long end) 426 { 427 if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || 428 !resv->css) 429 return; 430 431 page_counter_uncharge(resv->reservation_counter, 432 (end - start) * resv->pages_per_hpage); 433 css_put(resv->css); 434 } 435 436 void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, 437 struct file_region *rg, 438 unsigned long nr_pages, 439 bool region_del) 440 { 441 if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) 442 return; 443 444 if (rg->reservation_counter && resv->pages_per_hpage && 445 !resv->reservation_counter) { 446 page_counter_uncharge(rg->reservation_counter, 447 nr_pages * resv->pages_per_hpage); 448 /* 449 * Only do css_put(rg->css) when we delete the entire region 450 * because one file_region must hold exactly one css reference. 451 */ 452 if (region_del) 453 css_put(rg->css); 454 } 455 } 456 457 enum { 458 RES_USAGE, 459 RES_RSVD_USAGE, 460 RES_LIMIT, 461 RES_RSVD_LIMIT, 462 RES_MAX_USAGE, 463 RES_RSVD_MAX_USAGE, 464 RES_FAILCNT, 465 RES_RSVD_FAILCNT, 466 }; 467 468 static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) 469 { 470 int nid; 471 struct cftype *cft = seq_cft(seq); 472 int idx = MEMFILE_IDX(cft->private); 473 bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys); 474 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); 475 struct cgroup_subsys_state *css; 476 unsigned long usage; 477 478 if (legacy) { 479 /* Add up usage across all nodes for the non-hierarchical total. */ 480 usage = 0; 481 for_each_node_state(nid, N_MEMORY) 482 usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); 483 seq_printf(seq, "total=%lu", usage * PAGE_SIZE); 484 485 /* Simply print the per-node usage for the non-hierarchical total. */ 486 for_each_node_state(nid, N_MEMORY) 487 seq_printf(seq, " N%d=%lu", nid, 488 READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * 489 PAGE_SIZE); 490 seq_putc(seq, '\n'); 491 } 492 493 /* 494 * The hierarchical total is pretty much the value recorded by the 495 * counter, so use that. 496 */ 497 seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", 498 page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); 499 500 /* 501 * For each node, transverse the css tree to obtain the hierarchical 502 * node usage. 503 */ 504 for_each_node_state(nid, N_MEMORY) { 505 usage = 0; 506 rcu_read_lock(); 507 css_for_each_descendant_pre(css, &h_cg->css) { 508 usage += READ_ONCE(hugetlb_cgroup_from_css(css) 509 ->nodeinfo[nid] 510 ->usage[idx]); 511 } 512 rcu_read_unlock(); 513 seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); 514 } 515 516 seq_putc(seq, '\n'); 517 518 return 0; 519 } 520 521 static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, 522 struct cftype *cft) 523 { 524 struct page_counter *counter; 525 struct page_counter *rsvd_counter; 526 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 527 528 counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; 529 rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; 530 531 switch (MEMFILE_ATTR(cft->private)) { 532 case RES_USAGE: 533 return (u64)page_counter_read(counter) * PAGE_SIZE; 534 case RES_RSVD_USAGE: 535 return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE; 536 case RES_LIMIT: 537 return (u64)counter->max * PAGE_SIZE; 538 case RES_RSVD_LIMIT: 539 return (u64)rsvd_counter->max * PAGE_SIZE; 540 case RES_MAX_USAGE: 541 return (u64)counter->watermark * PAGE_SIZE; 542 case RES_RSVD_MAX_USAGE: 543 return (u64)rsvd_counter->watermark * PAGE_SIZE; 544 case RES_FAILCNT: 545 return counter->failcnt; 546 case RES_RSVD_FAILCNT: 547 return rsvd_counter->failcnt; 548 default: 549 BUG(); 550 } 551 } 552 553 static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) 554 { 555 int idx; 556 u64 val; 557 struct cftype *cft = seq_cft(seq); 558 unsigned long limit; 559 struct page_counter *counter; 560 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); 561 562 idx = MEMFILE_IDX(cft->private); 563 counter = &h_cg->hugepage[idx]; 564 565 limit = round_down(PAGE_COUNTER_MAX, 566 pages_per_huge_page(&hstates[idx])); 567 568 switch (MEMFILE_ATTR(cft->private)) { 569 case RES_RSVD_USAGE: 570 counter = &h_cg->rsvd_hugepage[idx]; 571 fallthrough; 572 case RES_USAGE: 573 val = (u64)page_counter_read(counter); 574 seq_printf(seq, "%llu\n", val * PAGE_SIZE); 575 break; 576 case RES_RSVD_LIMIT: 577 counter = &h_cg->rsvd_hugepage[idx]; 578 fallthrough; 579 case RES_LIMIT: 580 val = (u64)counter->max; 581 if (val == limit) 582 seq_puts(seq, "max\n"); 583 else 584 seq_printf(seq, "%llu\n", val * PAGE_SIZE); 585 break; 586 default: 587 BUG(); 588 } 589 590 return 0; 591 } 592 593 static DEFINE_MUTEX(hugetlb_limit_mutex); 594 595 static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, 596 char *buf, size_t nbytes, loff_t off, 597 const char *max) 598 { 599 int ret, idx; 600 unsigned long nr_pages; 601 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 602 bool rsvd = false; 603 604 if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ 605 return -EINVAL; 606 607 buf = strstrip(buf); 608 ret = page_counter_memparse(buf, max, &nr_pages); 609 if (ret) 610 return ret; 611 612 idx = MEMFILE_IDX(of_cft(of)->private); 613 nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); 614 615 switch (MEMFILE_ATTR(of_cft(of)->private)) { 616 case RES_RSVD_LIMIT: 617 rsvd = true; 618 fallthrough; 619 case RES_LIMIT: 620 mutex_lock(&hugetlb_limit_mutex); 621 ret = page_counter_set_max( 622 __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), 623 nr_pages); 624 mutex_unlock(&hugetlb_limit_mutex); 625 break; 626 default: 627 ret = -EINVAL; 628 break; 629 } 630 return ret ?: nbytes; 631 } 632 633 static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, 634 char *buf, size_t nbytes, loff_t off) 635 { 636 return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); 637 } 638 639 static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, 640 char *buf, size_t nbytes, loff_t off) 641 { 642 return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); 643 } 644 645 static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, 646 char *buf, size_t nbytes, loff_t off) 647 { 648 int ret = 0; 649 struct page_counter *counter, *rsvd_counter; 650 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 651 652 counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; 653 rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; 654 655 switch (MEMFILE_ATTR(of_cft(of)->private)) { 656 case RES_MAX_USAGE: 657 page_counter_reset_watermark(counter); 658 break; 659 case RES_RSVD_MAX_USAGE: 660 page_counter_reset_watermark(rsvd_counter); 661 break; 662 case RES_FAILCNT: 663 counter->failcnt = 0; 664 break; 665 case RES_RSVD_FAILCNT: 666 rsvd_counter->failcnt = 0; 667 break; 668 default: 669 ret = -EINVAL; 670 break; 671 } 672 return ret ?: nbytes; 673 } 674 675 static char *mem_fmt(char *buf, int size, unsigned long hsize) 676 { 677 if (hsize >= SZ_1G) 678 snprintf(buf, size, "%luGB", hsize / SZ_1G); 679 else if (hsize >= SZ_1M) 680 snprintf(buf, size, "%luMB", hsize / SZ_1M); 681 else 682 snprintf(buf, size, "%luKB", hsize / SZ_1K); 683 return buf; 684 } 685 686 static int __hugetlb_events_show(struct seq_file *seq, bool local) 687 { 688 int idx; 689 long max; 690 struct cftype *cft = seq_cft(seq); 691 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); 692 693 idx = MEMFILE_IDX(cft->private); 694 695 if (local) 696 max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); 697 else 698 max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); 699 700 seq_printf(seq, "max %lu\n", max); 701 702 return 0; 703 } 704 705 static int hugetlb_events_show(struct seq_file *seq, void *v) 706 { 707 return __hugetlb_events_show(seq, false); 708 } 709 710 static int hugetlb_events_local_show(struct seq_file *seq, void *v) 711 { 712 return __hugetlb_events_show(seq, true); 713 } 714 715 static struct cftype hugetlb_dfl_tmpl[] = { 716 { 717 .name = "max", 718 .private = RES_LIMIT, 719 .seq_show = hugetlb_cgroup_read_u64_max, 720 .write = hugetlb_cgroup_write_dfl, 721 .flags = CFTYPE_NOT_ON_ROOT, 722 }, 723 { 724 .name = "rsvd.max", 725 .private = RES_RSVD_LIMIT, 726 .seq_show = hugetlb_cgroup_read_u64_max, 727 .write = hugetlb_cgroup_write_dfl, 728 .flags = CFTYPE_NOT_ON_ROOT, 729 }, 730 { 731 .name = "current", 732 .private = RES_USAGE, 733 .seq_show = hugetlb_cgroup_read_u64_max, 734 .flags = CFTYPE_NOT_ON_ROOT, 735 }, 736 { 737 .name = "rsvd.current", 738 .private = RES_RSVD_USAGE, 739 .seq_show = hugetlb_cgroup_read_u64_max, 740 .flags = CFTYPE_NOT_ON_ROOT, 741 }, 742 { 743 .name = "events", 744 .seq_show = hugetlb_events_show, 745 .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]), 746 .flags = CFTYPE_NOT_ON_ROOT, 747 }, 748 { 749 .name = "events.local", 750 .seq_show = hugetlb_events_local_show, 751 .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]), 752 .flags = CFTYPE_NOT_ON_ROOT, 753 }, 754 { 755 .name = "numa_stat", 756 .seq_show = hugetlb_cgroup_read_numa_stat, 757 .flags = CFTYPE_NOT_ON_ROOT, 758 }, 759 /* don't need terminator here */ 760 }; 761 762 static struct cftype hugetlb_legacy_tmpl[] = { 763 { 764 .name = "limit_in_bytes", 765 .private = RES_LIMIT, 766 .read_u64 = hugetlb_cgroup_read_u64, 767 .write = hugetlb_cgroup_write_legacy, 768 }, 769 { 770 .name = "rsvd.limit_in_bytes", 771 .private = RES_RSVD_LIMIT, 772 .read_u64 = hugetlb_cgroup_read_u64, 773 .write = hugetlb_cgroup_write_legacy, 774 }, 775 { 776 .name = "usage_in_bytes", 777 .private = RES_USAGE, 778 .read_u64 = hugetlb_cgroup_read_u64, 779 }, 780 { 781 .name = "rsvd.usage_in_bytes", 782 .private = RES_RSVD_USAGE, 783 .read_u64 = hugetlb_cgroup_read_u64, 784 }, 785 { 786 .name = "max_usage_in_bytes", 787 .private = RES_MAX_USAGE, 788 .write = hugetlb_cgroup_reset, 789 .read_u64 = hugetlb_cgroup_read_u64, 790 }, 791 { 792 .name = "rsvd.max_usage_in_bytes", 793 .private = RES_RSVD_MAX_USAGE, 794 .write = hugetlb_cgroup_reset, 795 .read_u64 = hugetlb_cgroup_read_u64, 796 }, 797 { 798 .name = "failcnt", 799 .private = RES_FAILCNT, 800 .write = hugetlb_cgroup_reset, 801 .read_u64 = hugetlb_cgroup_read_u64, 802 }, 803 { 804 .name = "rsvd.failcnt", 805 .private = RES_RSVD_FAILCNT, 806 .write = hugetlb_cgroup_reset, 807 .read_u64 = hugetlb_cgroup_read_u64, 808 }, 809 { 810 .name = "numa_stat", 811 .seq_show = hugetlb_cgroup_read_numa_stat, 812 }, 813 /* don't need terminator here */ 814 }; 815 816 static void __init 817 hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft, 818 struct cftype *tmpl, int tmpl_size) 819 { 820 char buf[32]; 821 int i, idx = hstate_index(h); 822 823 /* format the size */ 824 mem_fmt(buf, sizeof(buf), huge_page_size(h)); 825 826 for (i = 0; i < tmpl_size; cft++, tmpl++, i++) { 827 *cft = *tmpl; 828 /* rebuild the name */ 829 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.%s", buf, tmpl->name); 830 /* rebuild the private */ 831 cft->private = MEMFILE_PRIVATE(idx, tmpl->private); 832 /* rebuild the file_offset */ 833 if (tmpl->file_offset) { 834 unsigned int offset = tmpl->file_offset; 835 836 cft->file_offset = MEMFILE_OFFSET0(offset) + 837 MEMFILE_FIELD_SIZE(offset) * idx; 838 } 839 840 lockdep_register_key(&cft->lockdep_key); 841 } 842 } 843 844 static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h) 845 { 846 int idx = hstate_index(h); 847 848 hugetlb_cgroup_cfttypes_init(h, dfl_files + idx * DFL_TMPL_SIZE, 849 hugetlb_dfl_tmpl, DFL_TMPL_SIZE); 850 } 851 852 static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h) 853 { 854 int idx = hstate_index(h); 855 856 hugetlb_cgroup_cfttypes_init(h, legacy_files + idx * LEGACY_TMPL_SIZE, 857 hugetlb_legacy_tmpl, LEGACY_TMPL_SIZE); 858 } 859 860 static void __init __hugetlb_cgroup_file_init(struct hstate *h) 861 { 862 __hugetlb_cgroup_file_dfl_init(h); 863 __hugetlb_cgroup_file_legacy_init(h); 864 } 865 866 static void __init __hugetlb_cgroup_file_pre_init(void) 867 { 868 int cft_count; 869 870 cft_count = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */ 871 dfl_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); 872 BUG_ON(!dfl_files); 873 cft_count = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */ 874 legacy_files = kcalloc(cft_count, sizeof(struct cftype), GFP_KERNEL); 875 BUG_ON(!legacy_files); 876 } 877 878 static void __init __hugetlb_cgroup_file_post_init(void) 879 { 880 WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, 881 dfl_files)); 882 WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, 883 legacy_files)); 884 } 885 886 void __init hugetlb_cgroup_file_init(void) 887 { 888 struct hstate *h; 889 890 __hugetlb_cgroup_file_pre_init(); 891 for_each_hstate(h) 892 __hugetlb_cgroup_file_init(h); 893 __hugetlb_cgroup_file_post_init(); 894 } 895 896 /* 897 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen 898 * when we migrate hugepages 899 */ 900 void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) 901 { 902 struct hugetlb_cgroup *h_cg; 903 struct hugetlb_cgroup *h_cg_rsvd; 904 struct hstate *h = folio_hstate(old_folio); 905 906 if (hugetlb_cgroup_disabled()) 907 return; 908 909 spin_lock_irq(&hugetlb_lock); 910 h_cg = hugetlb_cgroup_from_folio(old_folio); 911 h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); 912 set_hugetlb_cgroup(old_folio, NULL); 913 set_hugetlb_cgroup_rsvd(old_folio, NULL); 914 915 /* move the h_cg details to new cgroup */ 916 set_hugetlb_cgroup(new_folio, h_cg); 917 set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); 918 list_move(&new_folio->lru, &h->hugepage_activelist); 919 spin_unlock_irq(&hugetlb_lock); 920 return; 921 } 922 923 static struct cftype hugetlb_files[] = { 924 {} /* terminate */ 925 }; 926 927 struct cgroup_subsys hugetlb_cgrp_subsys = { 928 .css_alloc = hugetlb_cgroup_css_alloc, 929 .css_offline = hugetlb_cgroup_css_offline, 930 .css_free = hugetlb_cgroup_css_free, 931 .dfl_cftypes = hugetlb_files, 932 .legacy_cftypes = hugetlb_files, 933 }; 934
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.