1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * RDMA resource limiting controller for cgroups. 4 * 5 * Used to allow a cgroup hierarchy to stop processes from consuming 6 * additional RDMA resources after a certain limit is reached. 7 * 8 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> 9 */ 10 11 #include <linux/bitops.h> 12 #include <linux/slab.h> 13 #include <linux/seq_file.h> 14 #include <linux/cgroup.h> 15 #include <linux/parser.h> 16 #include <linux/cgroup_rdma.h> 17 18 #define RDMACG_MAX_STR "max" 19 20 /* 21 * Protects list of resource pools maintained on per cgroup basis 22 * and rdma device list. 23 */ 24 static DEFINE_MUTEX(rdmacg_mutex); 25 static LIST_HEAD(rdmacg_devices); 26 27 enum rdmacg_file_type { 28 RDMACG_RESOURCE_TYPE_MAX, 29 RDMACG_RESOURCE_TYPE_STAT, 30 }; 31 32 /* 33 * resource table definition as to be seen by the user. 34 * Need to add entries to it when more resources are 35 * added/defined at IB verb/core layer. 36 */ 37 static char const *rdmacg_resource_names[] = { 38 [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle", 39 [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object", 40 }; 41 42 /* resource tracker for each resource of rdma cgroup */ 43 struct rdmacg_resource { 44 int max; 45 int usage; 46 }; 47 48 /* 49 * resource pool object which represents per cgroup, per device 50 * resources. There are multiple instances of this object per cgroup, 51 * therefore it cannot be embedded within rdma_cgroup structure. It 52 * is maintained as list. 53 */ 54 struct rdmacg_resource_pool { 55 struct rdmacg_device *device; 56 struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; 57 58 struct list_head cg_node; 59 struct list_head dev_node; 60 61 /* count active user tasks of this pool */ 62 u64 usage_sum; 63 /* total number counts which are set to max */ 64 int num_max_cnt; 65 }; 66 67 static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) 68 { 69 return container_of(css, struct rdma_cgroup, css); 70 } 71 72 static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) 73 { 74 return css_rdmacg(cg->css.parent); 75 } 76 77 static inline struct rdma_cgroup *get_current_rdmacg(void) 78 { 79 return css_rdmacg(task_get_css(current, rdma_cgrp_id)); 80 } 81 82 static void set_resource_limit(struct rdmacg_resource_pool *rpool, 83 int index, int new_max) 84 { 85 if (new_max == S32_MAX) { 86 if (rpool->resources[index].max != S32_MAX) 87 rpool->num_max_cnt++; 88 } else { 89 if (rpool->resources[index].max == S32_MAX) 90 rpool->num_max_cnt--; 91 } 92 rpool->resources[index].max = new_max; 93 } 94 95 static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) 96 { 97 int i; 98 99 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) 100 set_resource_limit(rpool, i, S32_MAX); 101 } 102 103 static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) 104 { 105 lockdep_assert_held(&rdmacg_mutex); 106 107 list_del(&rpool->cg_node); 108 list_del(&rpool->dev_node); 109 kfree(rpool); 110 } 111 112 static struct rdmacg_resource_pool * 113 find_cg_rpool_locked(struct rdma_cgroup *cg, 114 struct rdmacg_device *device) 115 116 { 117 struct rdmacg_resource_pool *pool; 118 119 lockdep_assert_held(&rdmacg_mutex); 120 121 list_for_each_entry(pool, &cg->rpools, cg_node) 122 if (pool->device == device) 123 return pool; 124 125 return NULL; 126 } 127 128 static struct rdmacg_resource_pool * 129 get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) 130 { 131 struct rdmacg_resource_pool *rpool; 132 133 rpool = find_cg_rpool_locked(cg, device); 134 if (rpool) 135 return rpool; 136 137 rpool = kzalloc(sizeof(*rpool), GFP_KERNEL); 138 if (!rpool) 139 return ERR_PTR(-ENOMEM); 140 141 rpool->device = device; 142 set_all_resource_max_limit(rpool); 143 144 INIT_LIST_HEAD(&rpool->cg_node); 145 INIT_LIST_HEAD(&rpool->dev_node); 146 list_add_tail(&rpool->cg_node, &cg->rpools); 147 list_add_tail(&rpool->dev_node, &device->rpools); 148 return rpool; 149 } 150 151 /** 152 * uncharge_cg_locked - uncharge resource for rdma cgroup 153 * @cg: pointer to cg to uncharge and all parents in hierarchy 154 * @device: pointer to rdmacg device 155 * @index: index of the resource to uncharge in cg (resource pool) 156 * 157 * It also frees the resource pool which was created as part of 158 * charging operation when there are no resources attached to 159 * resource pool. 160 */ 161 static void 162 uncharge_cg_locked(struct rdma_cgroup *cg, 163 struct rdmacg_device *device, 164 enum rdmacg_resource_type index) 165 { 166 struct rdmacg_resource_pool *rpool; 167 168 rpool = find_cg_rpool_locked(cg, device); 169 170 /* 171 * rpool cannot be null at this stage. Let kernel operate in case 172 * if there a bug in IB stack or rdma controller, instead of crashing 173 * the system. 174 */ 175 if (unlikely(!rpool)) { 176 pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device); 177 return; 178 } 179 180 rpool->resources[index].usage--; 181 182 /* 183 * A negative count (or overflow) is invalid, 184 * it indicates a bug in the rdma controller. 185 */ 186 WARN_ON_ONCE(rpool->resources[index].usage < 0); 187 rpool->usage_sum--; 188 if (rpool->usage_sum == 0 && 189 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { 190 /* 191 * No user of the rpool and all entries are set to max, so 192 * safe to delete this rpool. 193 */ 194 free_cg_rpool_locked(rpool); 195 } 196 } 197 198 /** 199 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count 200 * @cg: pointer to cg to uncharge and all parents in hierarchy 201 * @device: pointer to rdmacg device 202 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup 203 * stop uncharging 204 * @index: index of the resource to uncharge in cg in given resource pool 205 */ 206 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, 207 struct rdmacg_device *device, 208 struct rdma_cgroup *stop_cg, 209 enum rdmacg_resource_type index) 210 { 211 struct rdma_cgroup *p; 212 213 mutex_lock(&rdmacg_mutex); 214 215 for (p = cg; p != stop_cg; p = parent_rdmacg(p)) 216 uncharge_cg_locked(p, device, index); 217 218 mutex_unlock(&rdmacg_mutex); 219 220 css_put(&cg->css); 221 } 222 223 /** 224 * rdmacg_uncharge - hierarchically uncharge rdma resource count 225 * @cg: pointer to cg to uncharge and all parents in hierarchy 226 * @device: pointer to rdmacg device 227 * @index: index of the resource to uncharge in cgroup in given resource pool 228 */ 229 void rdmacg_uncharge(struct rdma_cgroup *cg, 230 struct rdmacg_device *device, 231 enum rdmacg_resource_type index) 232 { 233 if (index >= RDMACG_RESOURCE_MAX) 234 return; 235 236 rdmacg_uncharge_hierarchy(cg, device, NULL, index); 237 } 238 EXPORT_SYMBOL(rdmacg_uncharge); 239 240 /** 241 * rdmacg_try_charge - hierarchically try to charge the rdma resource 242 * @rdmacg: pointer to rdma cgroup which will own this resource 243 * @device: pointer to rdmacg device 244 * @index: index of the resource to charge in cgroup (resource pool) 245 * 246 * This function follows charging resource in hierarchical way. 247 * It will fail if the charge would cause the new value to exceed the 248 * hierarchical limit. 249 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. 250 * Returns pointer to rdmacg for this resource when charging is successful. 251 * 252 * Charger needs to account resources on two criteria. 253 * (a) per cgroup & (b) per device resource usage. 254 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross 255 * the configured limits. Per device provides granular configuration 256 * in multi device usage. It allocates resource pool in the hierarchy 257 * for each parent it come across for first resource. Later on resource 258 * pool will be available. Therefore it will be much faster thereon 259 * to charge/uncharge. 260 */ 261 int rdmacg_try_charge(struct rdma_cgroup **rdmacg, 262 struct rdmacg_device *device, 263 enum rdmacg_resource_type index) 264 { 265 struct rdma_cgroup *cg, *p; 266 struct rdmacg_resource_pool *rpool; 267 s64 new; 268 int ret = 0; 269 270 if (index >= RDMACG_RESOURCE_MAX) 271 return -EINVAL; 272 273 /* 274 * hold on to css, as cgroup can be removed but resource 275 * accounting happens on css. 276 */ 277 cg = get_current_rdmacg(); 278 279 mutex_lock(&rdmacg_mutex); 280 for (p = cg; p; p = parent_rdmacg(p)) { 281 rpool = get_cg_rpool_locked(p, device); 282 if (IS_ERR(rpool)) { 283 ret = PTR_ERR(rpool); 284 goto err; 285 } else { 286 new = rpool->resources[index].usage + 1; 287 if (new > rpool->resources[index].max) { 288 ret = -EAGAIN; 289 goto err; 290 } else { 291 rpool->resources[index].usage = new; 292 rpool->usage_sum++; 293 } 294 } 295 } 296 mutex_unlock(&rdmacg_mutex); 297 298 *rdmacg = cg; 299 return 0; 300 301 err: 302 mutex_unlock(&rdmacg_mutex); 303 rdmacg_uncharge_hierarchy(cg, device, p, index); 304 return ret; 305 } 306 EXPORT_SYMBOL(rdmacg_try_charge); 307 308 /** 309 * rdmacg_register_device - register rdmacg device to rdma controller. 310 * @device: pointer to rdmacg device whose resources need to be accounted. 311 * 312 * If IB stack wish a device to participate in rdma cgroup resource 313 * tracking, it must invoke this API to register with rdma cgroup before 314 * any user space application can start using the RDMA resources. 315 */ 316 void rdmacg_register_device(struct rdmacg_device *device) 317 { 318 INIT_LIST_HEAD(&device->dev_node); 319 INIT_LIST_HEAD(&device->rpools); 320 321 mutex_lock(&rdmacg_mutex); 322 list_add_tail(&device->dev_node, &rdmacg_devices); 323 mutex_unlock(&rdmacg_mutex); 324 } 325 EXPORT_SYMBOL(rdmacg_register_device); 326 327 /** 328 * rdmacg_unregister_device - unregister rdmacg device from rdma controller. 329 * @device: pointer to rdmacg device which was previously registered with rdma 330 * controller using rdmacg_register_device(). 331 * 332 * IB stack must invoke this after all the resources of the IB device 333 * are destroyed and after ensuring that no more resources will be created 334 * when this API is invoked. 335 */ 336 void rdmacg_unregister_device(struct rdmacg_device *device) 337 { 338 struct rdmacg_resource_pool *rpool, *tmp; 339 340 /* 341 * Synchronize with any active resource settings, 342 * usage query happening via configfs. 343 */ 344 mutex_lock(&rdmacg_mutex); 345 list_del_init(&device->dev_node); 346 347 /* 348 * Now that this device is off the cgroup list, its safe to free 349 * all the rpool resources. 350 */ 351 list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) 352 free_cg_rpool_locked(rpool); 353 354 mutex_unlock(&rdmacg_mutex); 355 } 356 EXPORT_SYMBOL(rdmacg_unregister_device); 357 358 static int parse_resource(char *c, int *intval) 359 { 360 substring_t argstr; 361 char *name, *value = c; 362 size_t len; 363 int ret, i; 364 365 name = strsep(&value, "="); 366 if (!name || !value) 367 return -EINVAL; 368 369 i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name); 370 if (i < 0) 371 return i; 372 373 len = strlen(value); 374 375 argstr.from = value; 376 argstr.to = value + len; 377 378 ret = match_int(&argstr, intval); 379 if (ret >= 0) { 380 if (*intval < 0) 381 return -EINVAL; 382 return i; 383 } 384 if (strncmp(value, RDMACG_MAX_STR, len) == 0) { 385 *intval = S32_MAX; 386 return i; 387 } 388 return -EINVAL; 389 } 390 391 static int rdmacg_parse_limits(char *options, 392 int *new_limits, unsigned long *enables) 393 { 394 char *c; 395 int err = -EINVAL; 396 397 /* parse resource options */ 398 while ((c = strsep(&options, " ")) != NULL) { 399 int index, intval; 400 401 index = parse_resource(c, &intval); 402 if (index < 0) 403 goto err; 404 405 new_limits[index] = intval; 406 *enables |= BIT(index); 407 } 408 return 0; 409 410 err: 411 return err; 412 } 413 414 static struct rdmacg_device *rdmacg_get_device_locked(const char *name) 415 { 416 struct rdmacg_device *device; 417 418 lockdep_assert_held(&rdmacg_mutex); 419 420 list_for_each_entry(device, &rdmacg_devices, dev_node) 421 if (!strcmp(name, device->name)) 422 return device; 423 424 return NULL; 425 } 426 427 static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, 428 char *buf, size_t nbytes, loff_t off) 429 { 430 struct rdma_cgroup *cg = css_rdmacg(of_css(of)); 431 const char *dev_name; 432 struct rdmacg_resource_pool *rpool; 433 struct rdmacg_device *device; 434 char *options = strstrip(buf); 435 int *new_limits; 436 unsigned long enables = 0; 437 int i = 0, ret = 0; 438 439 /* extract the device name first */ 440 dev_name = strsep(&options, " "); 441 if (!dev_name) { 442 ret = -EINVAL; 443 goto err; 444 } 445 446 new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL); 447 if (!new_limits) { 448 ret = -ENOMEM; 449 goto err; 450 } 451 452 ret = rdmacg_parse_limits(options, new_limits, &enables); 453 if (ret) 454 goto parse_err; 455 456 /* acquire lock to synchronize with hot plug devices */ 457 mutex_lock(&rdmacg_mutex); 458 459 device = rdmacg_get_device_locked(dev_name); 460 if (!device) { 461 ret = -ENODEV; 462 goto dev_err; 463 } 464 465 rpool = get_cg_rpool_locked(cg, device); 466 if (IS_ERR(rpool)) { 467 ret = PTR_ERR(rpool); 468 goto dev_err; 469 } 470 471 /* now set the new limits of the rpool */ 472 for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX) 473 set_resource_limit(rpool, i, new_limits[i]); 474 475 if (rpool->usage_sum == 0 && 476 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { 477 /* 478 * No user of the rpool and all entries are set to max, so 479 * safe to delete this rpool. 480 */ 481 free_cg_rpool_locked(rpool); 482 } 483 484 dev_err: 485 mutex_unlock(&rdmacg_mutex); 486 487 parse_err: 488 kfree(new_limits); 489 490 err: 491 return ret ?: nbytes; 492 } 493 494 static void print_rpool_values(struct seq_file *sf, 495 struct rdmacg_resource_pool *rpool) 496 { 497 enum rdmacg_file_type sf_type; 498 int i; 499 u32 value; 500 501 sf_type = seq_cft(sf)->private; 502 503 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { 504 seq_puts(sf, rdmacg_resource_names[i]); 505 seq_putc(sf, '='); 506 if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { 507 if (rpool) 508 value = rpool->resources[i].max; 509 else 510 value = S32_MAX; 511 } else { 512 if (rpool) 513 value = rpool->resources[i].usage; 514 else 515 value = 0; 516 } 517 518 if (value == S32_MAX) 519 seq_puts(sf, RDMACG_MAX_STR); 520 else 521 seq_printf(sf, "%d", value); 522 seq_putc(sf, ' '); 523 } 524 } 525 526 static int rdmacg_resource_read(struct seq_file *sf, void *v) 527 { 528 struct rdmacg_device *device; 529 struct rdmacg_resource_pool *rpool; 530 struct rdma_cgroup *cg = css_rdmacg(seq_css(sf)); 531 532 mutex_lock(&rdmacg_mutex); 533 534 list_for_each_entry(device, &rdmacg_devices, dev_node) { 535 seq_printf(sf, "%s ", device->name); 536 537 rpool = find_cg_rpool_locked(cg, device); 538 print_rpool_values(sf, rpool); 539 540 seq_putc(sf, '\n'); 541 } 542 543 mutex_unlock(&rdmacg_mutex); 544 return 0; 545 } 546 547 static struct cftype rdmacg_files[] = { 548 { 549 .name = "max", 550 .write = rdmacg_resource_set_max, 551 .seq_show = rdmacg_resource_read, 552 .private = RDMACG_RESOURCE_TYPE_MAX, 553 .flags = CFTYPE_NOT_ON_ROOT, 554 }, 555 { 556 .name = "current", 557 .seq_show = rdmacg_resource_read, 558 .private = RDMACG_RESOURCE_TYPE_STAT, 559 .flags = CFTYPE_NOT_ON_ROOT, 560 }, 561 { } /* terminate */ 562 }; 563 564 static struct cgroup_subsys_state * 565 rdmacg_css_alloc(struct cgroup_subsys_state *parent) 566 { 567 struct rdma_cgroup *cg; 568 569 cg = kzalloc(sizeof(*cg), GFP_KERNEL); 570 if (!cg) 571 return ERR_PTR(-ENOMEM); 572 573 INIT_LIST_HEAD(&cg->rpools); 574 return &cg->css; 575 } 576 577 static void rdmacg_css_free(struct cgroup_subsys_state *css) 578 { 579 struct rdma_cgroup *cg = css_rdmacg(css); 580 581 kfree(cg); 582 } 583 584 /** 585 * rdmacg_css_offline - cgroup css_offline callback 586 * @css: css of interest 587 * 588 * This function is called when @css is about to go away and responsible 589 * for shooting down all rdmacg associated with @css. As part of that it 590 * marks all the resource pool entries to max value, so that when resources are 591 * uncharged, associated resource pool can be freed as well. 592 */ 593 static void rdmacg_css_offline(struct cgroup_subsys_state *css) 594 { 595 struct rdma_cgroup *cg = css_rdmacg(css); 596 struct rdmacg_resource_pool *rpool; 597 598 mutex_lock(&rdmacg_mutex); 599 600 list_for_each_entry(rpool, &cg->rpools, cg_node) 601 set_all_resource_max_limit(rpool); 602 603 mutex_unlock(&rdmacg_mutex); 604 } 605 606 struct cgroup_subsys rdma_cgrp_subsys = { 607 .css_alloc = rdmacg_css_alloc, 608 .css_free = rdmacg_css_free, 609 .css_offline = rdmacg_css_offline, 610 .legacy_cftypes = rdmacg_files, 611 .dfl_cftypes = rdmacg_files, 612 }; 613
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.