~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/kernel/cgroup/rdma.c

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*
  3  * RDMA resource limiting controller for cgroups.
  4  *
  5  * Used to allow a cgroup hierarchy to stop processes from consuming
  6  * additional RDMA resources after a certain limit is reached.
  7  *
  8  * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
  9  */
 10 
 11 #include <linux/bitops.h>
 12 #include <linux/slab.h>
 13 #include <linux/seq_file.h>
 14 #include <linux/cgroup.h>
 15 #include <linux/parser.h>
 16 #include <linux/cgroup_rdma.h>
 17 
 18 #define RDMACG_MAX_STR "max"
 19 
 20 /*
 21  * Protects list of resource pools maintained on per cgroup basis
 22  * and rdma device list.
 23  */
 24 static DEFINE_MUTEX(rdmacg_mutex);
 25 static LIST_HEAD(rdmacg_devices);
 26 
 27 enum rdmacg_file_type {
 28         RDMACG_RESOURCE_TYPE_MAX,
 29         RDMACG_RESOURCE_TYPE_STAT,
 30 };
 31 
 32 /*
 33  * resource table definition as to be seen by the user.
 34  * Need to add entries to it when more resources are
 35  * added/defined at IB verb/core layer.
 36  */
 37 static char const *rdmacg_resource_names[] = {
 38         [RDMACG_RESOURCE_HCA_HANDLE]    = "hca_handle",
 39         [RDMACG_RESOURCE_HCA_OBJECT]    = "hca_object",
 40 };
 41 
 42 /* resource tracker for each resource of rdma cgroup */
 43 struct rdmacg_resource {
 44         int max;
 45         int usage;
 46 };
 47 
 48 /*
 49  * resource pool object which represents per cgroup, per device
 50  * resources. There are multiple instances of this object per cgroup,
 51  * therefore it cannot be embedded within rdma_cgroup structure. It
 52  * is maintained as list.
 53  */
 54 struct rdmacg_resource_pool {
 55         struct rdmacg_device    *device;
 56         struct rdmacg_resource  resources[RDMACG_RESOURCE_MAX];
 57 
 58         struct list_head        cg_node;
 59         struct list_head        dev_node;
 60 
 61         /* count active user tasks of this pool */
 62         u64                     usage_sum;
 63         /* total number counts which are set to max */
 64         int                     num_max_cnt;
 65 };
 66 
 67 static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
 68 {
 69         return container_of(css, struct rdma_cgroup, css);
 70 }
 71 
 72 static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
 73 {
 74         return css_rdmacg(cg->css.parent);
 75 }
 76 
 77 static inline struct rdma_cgroup *get_current_rdmacg(void)
 78 {
 79         return css_rdmacg(task_get_css(current, rdma_cgrp_id));
 80 }
 81 
 82 static void set_resource_limit(struct rdmacg_resource_pool *rpool,
 83                                int index, int new_max)
 84 {
 85         if (new_max == S32_MAX) {
 86                 if (rpool->resources[index].max != S32_MAX)
 87                         rpool->num_max_cnt++;
 88         } else {
 89                 if (rpool->resources[index].max == S32_MAX)
 90                         rpool->num_max_cnt--;
 91         }
 92         rpool->resources[index].max = new_max;
 93 }
 94 
 95 static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
 96 {
 97         int i;
 98 
 99         for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
100                 set_resource_limit(rpool, i, S32_MAX);
101 }
102 
103 static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
104 {
105         lockdep_assert_held(&rdmacg_mutex);
106 
107         list_del(&rpool->cg_node);
108         list_del(&rpool->dev_node);
109         kfree(rpool);
110 }
111 
112 static struct rdmacg_resource_pool *
113 find_cg_rpool_locked(struct rdma_cgroup *cg,
114                      struct rdmacg_device *device)
115 
116 {
117         struct rdmacg_resource_pool *pool;
118 
119         lockdep_assert_held(&rdmacg_mutex);
120 
121         list_for_each_entry(pool, &cg->rpools, cg_node)
122                 if (pool->device == device)
123                         return pool;
124 
125         return NULL;
126 }
127 
128 static struct rdmacg_resource_pool *
129 get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
130 {
131         struct rdmacg_resource_pool *rpool;
132 
133         rpool = find_cg_rpool_locked(cg, device);
134         if (rpool)
135                 return rpool;
136 
137         rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
138         if (!rpool)
139                 return ERR_PTR(-ENOMEM);
140 
141         rpool->device = device;
142         set_all_resource_max_limit(rpool);
143 
144         INIT_LIST_HEAD(&rpool->cg_node);
145         INIT_LIST_HEAD(&rpool->dev_node);
146         list_add_tail(&rpool->cg_node, &cg->rpools);
147         list_add_tail(&rpool->dev_node, &device->rpools);
148         return rpool;
149 }
150 
151 /**
152  * uncharge_cg_locked - uncharge resource for rdma cgroup
153  * @cg: pointer to cg to uncharge and all parents in hierarchy
154  * @device: pointer to rdmacg device
155  * @index: index of the resource to uncharge in cg (resource pool)
156  *
157  * It also frees the resource pool which was created as part of
158  * charging operation when there are no resources attached to
159  * resource pool.
160  */
161 static void
162 uncharge_cg_locked(struct rdma_cgroup *cg,
163                    struct rdmacg_device *device,
164                    enum rdmacg_resource_type index)
165 {
166         struct rdmacg_resource_pool *rpool;
167 
168         rpool = find_cg_rpool_locked(cg, device);
169 
170         /*
171          * rpool cannot be null at this stage. Let kernel operate in case
172          * if there a bug in IB stack or rdma controller, instead of crashing
173          * the system.
174          */
175         if (unlikely(!rpool)) {
176                 pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
177                 return;
178         }
179 
180         rpool->resources[index].usage--;
181 
182         /*
183          * A negative count (or overflow) is invalid,
184          * it indicates a bug in the rdma controller.
185          */
186         WARN_ON_ONCE(rpool->resources[index].usage < 0);
187         rpool->usage_sum--;
188         if (rpool->usage_sum == 0 &&
189             rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
190                 /*
191                  * No user of the rpool and all entries are set to max, so
192                  * safe to delete this rpool.
193                  */
194                 free_cg_rpool_locked(rpool);
195         }
196 }
197 
198 /**
199  * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
200  * @cg: pointer to cg to uncharge and all parents in hierarchy
201  * @device: pointer to rdmacg device
202  * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
203  *           stop uncharging
204  * @index: index of the resource to uncharge in cg in given resource pool
205  */
206 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
207                                      struct rdmacg_device *device,
208                                      struct rdma_cgroup *stop_cg,
209                                      enum rdmacg_resource_type index)
210 {
211         struct rdma_cgroup *p;
212 
213         mutex_lock(&rdmacg_mutex);
214 
215         for (p = cg; p != stop_cg; p = parent_rdmacg(p))
216                 uncharge_cg_locked(p, device, index);
217 
218         mutex_unlock(&rdmacg_mutex);
219 
220         css_put(&cg->css);
221 }
222 
223 /**
224  * rdmacg_uncharge - hierarchically uncharge rdma resource count
225  * @cg: pointer to cg to uncharge and all parents in hierarchy
226  * @device: pointer to rdmacg device
227  * @index: index of the resource to uncharge in cgroup in given resource pool
228  */
229 void rdmacg_uncharge(struct rdma_cgroup *cg,
230                      struct rdmacg_device *device,
231                      enum rdmacg_resource_type index)
232 {
233         if (index >= RDMACG_RESOURCE_MAX)
234                 return;
235 
236         rdmacg_uncharge_hierarchy(cg, device, NULL, index);
237 }
238 EXPORT_SYMBOL(rdmacg_uncharge);
239 
240 /**
241  * rdmacg_try_charge - hierarchically try to charge the rdma resource
242  * @rdmacg: pointer to rdma cgroup which will own this resource
243  * @device: pointer to rdmacg device
244  * @index: index of the resource to charge in cgroup (resource pool)
245  *
246  * This function follows charging resource in hierarchical way.
247  * It will fail if the charge would cause the new value to exceed the
248  * hierarchical limit.
249  * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
250  * Returns pointer to rdmacg for this resource when charging is successful.
251  *
252  * Charger needs to account resources on two criteria.
253  * (a) per cgroup & (b) per device resource usage.
254  * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
255  * the configured limits. Per device provides granular configuration
256  * in multi device usage. It allocates resource pool in the hierarchy
257  * for each parent it come across for first resource. Later on resource
258  * pool will be available. Therefore it will be much faster thereon
259  * to charge/uncharge.
260  */
261 int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
262                       struct rdmacg_device *device,
263                       enum rdmacg_resource_type index)
264 {
265         struct rdma_cgroup *cg, *p;
266         struct rdmacg_resource_pool *rpool;
267         s64 new;
268         int ret = 0;
269 
270         if (index >= RDMACG_RESOURCE_MAX)
271                 return -EINVAL;
272 
273         /*
274          * hold on to css, as cgroup can be removed but resource
275          * accounting happens on css.
276          */
277         cg = get_current_rdmacg();
278 
279         mutex_lock(&rdmacg_mutex);
280         for (p = cg; p; p = parent_rdmacg(p)) {
281                 rpool = get_cg_rpool_locked(p, device);
282                 if (IS_ERR(rpool)) {
283                         ret = PTR_ERR(rpool);
284                         goto err;
285                 } else {
286                         new = rpool->resources[index].usage + 1;
287                         if (new > rpool->resources[index].max) {
288                                 ret = -EAGAIN;
289                                 goto err;
290                         } else {
291                                 rpool->resources[index].usage = new;
292                                 rpool->usage_sum++;
293                         }
294                 }
295         }
296         mutex_unlock(&rdmacg_mutex);
297 
298         *rdmacg = cg;
299         return 0;
300 
301 err:
302         mutex_unlock(&rdmacg_mutex);
303         rdmacg_uncharge_hierarchy(cg, device, p, index);
304         return ret;
305 }
306 EXPORT_SYMBOL(rdmacg_try_charge);
307 
308 /**
309  * rdmacg_register_device - register rdmacg device to rdma controller.
310  * @device: pointer to rdmacg device whose resources need to be accounted.
311  *
312  * If IB stack wish a device to participate in rdma cgroup resource
313  * tracking, it must invoke this API to register with rdma cgroup before
314  * any user space application can start using the RDMA resources.
315  */
316 void rdmacg_register_device(struct rdmacg_device *device)
317 {
318         INIT_LIST_HEAD(&device->dev_node);
319         INIT_LIST_HEAD(&device->rpools);
320 
321         mutex_lock(&rdmacg_mutex);
322         list_add_tail(&device->dev_node, &rdmacg_devices);
323         mutex_unlock(&rdmacg_mutex);
324 }
325 EXPORT_SYMBOL(rdmacg_register_device);
326 
327 /**
328  * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
329  * @device: pointer to rdmacg device which was previously registered with rdma
330  *          controller using rdmacg_register_device().
331  *
332  * IB stack must invoke this after all the resources of the IB device
333  * are destroyed and after ensuring that no more resources will be created
334  * when this API is invoked.
335  */
336 void rdmacg_unregister_device(struct rdmacg_device *device)
337 {
338         struct rdmacg_resource_pool *rpool, *tmp;
339 
340         /*
341          * Synchronize with any active resource settings,
342          * usage query happening via configfs.
343          */
344         mutex_lock(&rdmacg_mutex);
345         list_del_init(&device->dev_node);
346 
347         /*
348          * Now that this device is off the cgroup list, its safe to free
349          * all the rpool resources.
350          */
351         list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
352                 free_cg_rpool_locked(rpool);
353 
354         mutex_unlock(&rdmacg_mutex);
355 }
356 EXPORT_SYMBOL(rdmacg_unregister_device);
357 
358 static int parse_resource(char *c, int *intval)
359 {
360         substring_t argstr;
361         char *name, *value = c;
362         size_t len;
363         int ret, i;
364 
365         name = strsep(&value, "=");
366         if (!name || !value)
367                 return -EINVAL;
368 
369         i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
370         if (i < 0)
371                 return i;
372 
373         len = strlen(value);
374 
375         argstr.from = value;
376         argstr.to = value + len;
377 
378         ret = match_int(&argstr, intval);
379         if (ret >= 0) {
380                 if (*intval < 0)
381                         return -EINVAL;
382                 return i;
383         }
384         if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
385                 *intval = S32_MAX;
386                 return i;
387         }
388         return -EINVAL;
389 }
390 
391 static int rdmacg_parse_limits(char *options,
392                                int *new_limits, unsigned long *enables)
393 {
394         char *c;
395         int err = -EINVAL;
396 
397         /* parse resource options */
398         while ((c = strsep(&options, " ")) != NULL) {
399                 int index, intval;
400 
401                 index = parse_resource(c, &intval);
402                 if (index < 0)
403                         goto err;
404 
405                 new_limits[index] = intval;
406                 *enables |= BIT(index);
407         }
408         return 0;
409 
410 err:
411         return err;
412 }
413 
414 static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
415 {
416         struct rdmacg_device *device;
417 
418         lockdep_assert_held(&rdmacg_mutex);
419 
420         list_for_each_entry(device, &rdmacg_devices, dev_node)
421                 if (!strcmp(name, device->name))
422                         return device;
423 
424         return NULL;
425 }
426 
427 static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
428                                        char *buf, size_t nbytes, loff_t off)
429 {
430         struct rdma_cgroup *cg = css_rdmacg(of_css(of));
431         const char *dev_name;
432         struct rdmacg_resource_pool *rpool;
433         struct rdmacg_device *device;
434         char *options = strstrip(buf);
435         int *new_limits;
436         unsigned long enables = 0;
437         int i = 0, ret = 0;
438 
439         /* extract the device name first */
440         dev_name = strsep(&options, " ");
441         if (!dev_name) {
442                 ret = -EINVAL;
443                 goto err;
444         }
445 
446         new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
447         if (!new_limits) {
448                 ret = -ENOMEM;
449                 goto err;
450         }
451 
452         ret = rdmacg_parse_limits(options, new_limits, &enables);
453         if (ret)
454                 goto parse_err;
455 
456         /* acquire lock to synchronize with hot plug devices */
457         mutex_lock(&rdmacg_mutex);
458 
459         device = rdmacg_get_device_locked(dev_name);
460         if (!device) {
461                 ret = -ENODEV;
462                 goto dev_err;
463         }
464 
465         rpool = get_cg_rpool_locked(cg, device);
466         if (IS_ERR(rpool)) {
467                 ret = PTR_ERR(rpool);
468                 goto dev_err;
469         }
470 
471         /* now set the new limits of the rpool */
472         for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
473                 set_resource_limit(rpool, i, new_limits[i]);
474 
475         if (rpool->usage_sum == 0 &&
476             rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
477                 /*
478                  * No user of the rpool and all entries are set to max, so
479                  * safe to delete this rpool.
480                  */
481                 free_cg_rpool_locked(rpool);
482         }
483 
484 dev_err:
485         mutex_unlock(&rdmacg_mutex);
486 
487 parse_err:
488         kfree(new_limits);
489 
490 err:
491         return ret ?: nbytes;
492 }
493 
494 static void print_rpool_values(struct seq_file *sf,
495                                struct rdmacg_resource_pool *rpool)
496 {
497         enum rdmacg_file_type sf_type;
498         int i;
499         u32 value;
500 
501         sf_type = seq_cft(sf)->private;
502 
503         for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
504                 seq_puts(sf, rdmacg_resource_names[i]);
505                 seq_putc(sf, '=');
506                 if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
507                         if (rpool)
508                                 value = rpool->resources[i].max;
509                         else
510                                 value = S32_MAX;
511                 } else {
512                         if (rpool)
513                                 value = rpool->resources[i].usage;
514                         else
515                                 value = 0;
516                 }
517 
518                 if (value == S32_MAX)
519                         seq_puts(sf, RDMACG_MAX_STR);
520                 else
521                         seq_printf(sf, "%d", value);
522                 seq_putc(sf, ' ');
523         }
524 }
525 
526 static int rdmacg_resource_read(struct seq_file *sf, void *v)
527 {
528         struct rdmacg_device *device;
529         struct rdmacg_resource_pool *rpool;
530         struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
531 
532         mutex_lock(&rdmacg_mutex);
533 
534         list_for_each_entry(device, &rdmacg_devices, dev_node) {
535                 seq_printf(sf, "%s ", device->name);
536 
537                 rpool = find_cg_rpool_locked(cg, device);
538                 print_rpool_values(sf, rpool);
539 
540                 seq_putc(sf, '\n');
541         }
542 
543         mutex_unlock(&rdmacg_mutex);
544         return 0;
545 }
546 
547 static struct cftype rdmacg_files[] = {
548         {
549                 .name = "max",
550                 .write = rdmacg_resource_set_max,
551                 .seq_show = rdmacg_resource_read,
552                 .private = RDMACG_RESOURCE_TYPE_MAX,
553                 .flags = CFTYPE_NOT_ON_ROOT,
554         },
555         {
556                 .name = "current",
557                 .seq_show = rdmacg_resource_read,
558                 .private = RDMACG_RESOURCE_TYPE_STAT,
559                 .flags = CFTYPE_NOT_ON_ROOT,
560         },
561         { }     /* terminate */
562 };
563 
564 static struct cgroup_subsys_state *
565 rdmacg_css_alloc(struct cgroup_subsys_state *parent)
566 {
567         struct rdma_cgroup *cg;
568 
569         cg = kzalloc(sizeof(*cg), GFP_KERNEL);
570         if (!cg)
571                 return ERR_PTR(-ENOMEM);
572 
573         INIT_LIST_HEAD(&cg->rpools);
574         return &cg->css;
575 }
576 
/* css_free callback: release the rdma_cgroup that embeds @css. */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
        kfree(css_rdmacg(css));
}
583 
584 /**
585  * rdmacg_css_offline - cgroup css_offline callback
586  * @css: css of interest
587  *
588  * This function is called when @css is about to go away and responsible
589  * for shooting down all rdmacg associated with @css. As part of that it
590  * marks all the resource pool entries to max value, so that when resources are
591  * uncharged, associated resource pool can be freed as well.
592  */
593 static void rdmacg_css_offline(struct cgroup_subsys_state *css)
594 {
595         struct rdma_cgroup *cg = css_rdmacg(css);
596         struct rdmacg_resource_pool *rpool;
597 
598         mutex_lock(&rdmacg_mutex);
599 
600         list_for_each_entry(rpool, &cg->rpools, cg_node)
601                 set_all_resource_max_limit(rpool);
602 
603         mutex_unlock(&rdmacg_mutex);
604 }
605 
606 struct cgroup_subsys rdma_cgrp_subsys = {
607         .css_alloc      = rdmacg_css_alloc,
608         .css_free       = rdmacg_css_free,
609         .css_offline    = rdmacg_css_offline,
610         .legacy_cftypes = rdmacg_files,
611         .dfl_cftypes    = rdmacg_files,
612 };
613 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php