1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2022 Google */ 3 #include <linux/bpf.h> 4 #include <linux/btf_ids.h> 5 #include <linux/cgroup.h> 6 #include <linux/kernel.h> 7 #include <linux/seq_file.h> 8 9 #include "../cgroup/cgroup-internal.h" /* cgroup_mutex and cgroup_is_dead */ 10 11 /* cgroup_iter provides four modes of traversal to the cgroup hierarchy. 12 * 13 * 1. Walk the descendants of a cgroup in pre-order. 14 * 2. Walk the descendants of a cgroup in post-order. 15 * 3. Walk the ancestors of a cgroup. 16 * 4. Show the given cgroup only. 17 * 18 * For walking descendants, cgroup_iter can walk in either pre-order or 19 * post-order. For walking ancestors, the iter walks up from a cgroup to 20 * the root. 21 * 22 * The iter program can terminate the walk early by returning 1. Walk 23 * continues if prog returns 0. 24 * 25 * The prog can check (seq->num == 0) to determine whether this is 26 * the first element. The prog may also be passed a NULL cgroup, 27 * which means the walk has completed and the prog has a chance to 28 * do post-processing, such as outputting an epilogue. 29 * 30 * Note: the iter_prog is called with cgroup_mutex held. 31 * 32 * Currently only one session is supported, which means, depending on the 33 * volume of data bpf program intends to send to user space, the number 34 * of cgroups that can be walked is limited. For example, given the current 35 * buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each 36 * cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can 37 * be walked is 512. This is a limitation of cgroup_iter. If the output data 38 * is larger than the kernel buffer size, after all data in the kernel buffer 39 * is consumed by user space, the subsequent read() syscall will signal 40 * EOPNOTSUPP. In order to work around, the user may have to update their 41 * program to reduce the volume of data sent to output. For example, skip 42 * some uninteresting cgroups. 43 */ 44 45 struct bpf_iter__cgroup { 46 __bpf_md_ptr(struct bpf_iter_meta *, meta); 47 __bpf_md_ptr(struct cgroup *, cgroup); 48 }; 49 50 struct cgroup_iter_priv { 51 struct cgroup_subsys_state *start_css; 52 bool visited_all; 53 bool terminate; 54 int order; 55 }; 56 57 static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos) 58 { 59 struct cgroup_iter_priv *p = seq->private; 60 61 cgroup_lock(); 62 63 /* cgroup_iter doesn't support read across multiple sessions. */ 64 if (*pos > 0) { 65 if (p->visited_all) 66 return NULL; 67 68 /* Haven't visited all, but because cgroup_mutex has dropped, 69 * return -EOPNOTSUPP to indicate incomplete iteration. 70 */ 71 return ERR_PTR(-EOPNOTSUPP); 72 } 73 74 ++*pos; 75 p->terminate = false; 76 p->visited_all = false; 77 if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) 78 return css_next_descendant_pre(NULL, p->start_css); 79 else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) 80 return css_next_descendant_post(NULL, p->start_css); 81 else /* BPF_CGROUP_ITER_SELF_ONLY and BPF_CGROUP_ITER_ANCESTORS_UP */ 82 return p->start_css; 83 } 84 85 static int __cgroup_iter_seq_show(struct seq_file *seq, 86 struct cgroup_subsys_state *css, int in_stop); 87 88 static void cgroup_iter_seq_stop(struct seq_file *seq, void *v) 89 { 90 struct cgroup_iter_priv *p = seq->private; 91 92 cgroup_unlock(); 93 94 /* pass NULL to the prog for post-processing */ 95 if (!v) { 96 __cgroup_iter_seq_show(seq, NULL, true); 97 p->visited_all = true; 98 } 99 } 100 101 static void *cgroup_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos) 102 { 103 struct cgroup_subsys_state *curr = (struct cgroup_subsys_state *)v; 104 struct cgroup_iter_priv *p = seq->private; 105 106 ++*pos; 107 if (p->terminate) 108 return NULL; 109 110 if (p->order == BPF_CGROUP_ITER_DESCENDANTS_PRE) 111 return css_next_descendant_pre(curr, p->start_css); 112 else if (p->order == BPF_CGROUP_ITER_DESCENDANTS_POST) 113 return css_next_descendant_post(curr, p->start_css); 114 else if (p->order == BPF_CGROUP_ITER_ANCESTORS_UP) 115 return curr->parent; 116 else /* BPF_CGROUP_ITER_SELF_ONLY */ 117 return NULL; 118 } 119 120 static int __cgroup_iter_seq_show(struct seq_file *seq, 121 struct cgroup_subsys_state *css, int in_stop) 122 { 123 struct cgroup_iter_priv *p = seq->private; 124 struct bpf_iter__cgroup ctx; 125 struct bpf_iter_meta meta; 126 struct bpf_prog *prog; 127 int ret = 0; 128 129 /* cgroup is dead, skip this element */ 130 if (css && cgroup_is_dead(css->cgroup)) 131 return 0; 132 133 ctx.meta = &meta; 134 ctx.cgroup = css ? css->cgroup : NULL; 135 meta.seq = seq; 136 prog = bpf_iter_get_info(&meta, in_stop); 137 if (prog) 138 ret = bpf_iter_run_prog(prog, &ctx); 139 140 /* if prog returns > 0, terminate after this element. */ 141 if (ret != 0) 142 p->terminate = true; 143 144 return 0; 145 } 146 147 static int cgroup_iter_seq_show(struct seq_file *seq, void *v) 148 { 149 return __cgroup_iter_seq_show(seq, (struct cgroup_subsys_state *)v, 150 false); 151 } 152 153 static const struct seq_operations cgroup_iter_seq_ops = { 154 .start = cgroup_iter_seq_start, 155 .next = cgroup_iter_seq_next, 156 .stop = cgroup_iter_seq_stop, 157 .show = cgroup_iter_seq_show, 158 }; 159 160 BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id, struct, cgroup) 161 162 static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) 163 { 164 struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; 165 struct cgroup *cgrp = aux->cgroup.start; 166 167 /* bpf_iter_attach_cgroup() has already acquired an extra reference 168 * for the start cgroup, but the reference may be released after 169 * cgroup_iter_seq_init(), so acquire another reference for the 170 * start cgroup. 171 */ 172 p->start_css = &cgrp->self; 173 css_get(p->start_css); 174 p->terminate = false; 175 p->visited_all = false; 176 p->order = aux->cgroup.order; 177 return 0; 178 } 179 180 static void cgroup_iter_seq_fini(void *priv) 181 { 182 struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; 183 184 css_put(p->start_css); 185 } 186 187 static const struct bpf_iter_seq_info cgroup_iter_seq_info = { 188 .seq_ops = &cgroup_iter_seq_ops, 189 .init_seq_private = cgroup_iter_seq_init, 190 .fini_seq_private = cgroup_iter_seq_fini, 191 .seq_priv_size = sizeof(struct cgroup_iter_priv), 192 }; 193 194 static int bpf_iter_attach_cgroup(struct bpf_prog *prog, 195 union bpf_iter_link_info *linfo, 196 struct bpf_iter_aux_info *aux) 197 { 198 int fd = linfo->cgroup.cgroup_fd; 199 u64 id = linfo->cgroup.cgroup_id; 200 int order = linfo->cgroup.order; 201 struct cgroup *cgrp; 202 203 if (order != BPF_CGROUP_ITER_DESCENDANTS_PRE && 204 order != BPF_CGROUP_ITER_DESCENDANTS_POST && 205 order != BPF_CGROUP_ITER_ANCESTORS_UP && 206 order != BPF_CGROUP_ITER_SELF_ONLY) 207 return -EINVAL; 208 209 if (fd && id) 210 return -EINVAL; 211 212 if (fd) 213 cgrp = cgroup_v1v2_get_from_fd(fd); 214 else if (id) 215 cgrp = cgroup_get_from_id(id); 216 else /* walk the entire hierarchy by default. */ 217 cgrp = cgroup_get_from_path("/"); 218 219 if (IS_ERR(cgrp)) 220 return PTR_ERR(cgrp); 221 222 aux->cgroup.start = cgrp; 223 aux->cgroup.order = order; 224 return 0; 225 } 226 227 static void bpf_iter_detach_cgroup(struct bpf_iter_aux_info *aux) 228 { 229 cgroup_put(aux->cgroup.start); 230 } 231 232 static void bpf_iter_cgroup_show_fdinfo(const struct bpf_iter_aux_info *aux, 233 struct seq_file *seq) 234 { 235 char *buf; 236 237 buf = kzalloc(PATH_MAX, GFP_KERNEL); 238 if (!buf) { 239 seq_puts(seq, "cgroup_path:\t<unknown>\n"); 240 goto show_order; 241 } 242 243 /* If cgroup_path_ns() fails, buf will be an empty string, cgroup_path 244 * will print nothing. 245 * 246 * Path is in the calling process's cgroup namespace. 247 */ 248 cgroup_path_ns(aux->cgroup.start, buf, PATH_MAX, 249 current->nsproxy->cgroup_ns); 250 seq_printf(seq, "cgroup_path:\t%s\n", buf); 251 kfree(buf); 252 253 show_order: 254 if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_PRE) 255 seq_puts(seq, "order: descendants_pre\n"); 256 else if (aux->cgroup.order == BPF_CGROUP_ITER_DESCENDANTS_POST) 257 seq_puts(seq, "order: descendants_post\n"); 258 else if (aux->cgroup.order == BPF_CGROUP_ITER_ANCESTORS_UP) 259 seq_puts(seq, "order: ancestors_up\n"); 260 else /* BPF_CGROUP_ITER_SELF_ONLY */ 261 seq_puts(seq, "order: self_only\n"); 262 } 263 264 static int bpf_iter_cgroup_fill_link_info(const struct bpf_iter_aux_info *aux, 265 struct bpf_link_info *info) 266 { 267 info->iter.cgroup.order = aux->cgroup.order; 268 info->iter.cgroup.cgroup_id = cgroup_id(aux->cgroup.start); 269 return 0; 270 } 271 272 DEFINE_BPF_ITER_FUNC(cgroup, struct bpf_iter_meta *meta, 273 struct cgroup *cgroup) 274 275 static struct bpf_iter_reg bpf_cgroup_reg_info = { 276 .target = "cgroup", 277 .feature = BPF_ITER_RESCHED, 278 .attach_target = bpf_iter_attach_cgroup, 279 .detach_target = bpf_iter_detach_cgroup, 280 .show_fdinfo = bpf_iter_cgroup_show_fdinfo, 281 .fill_link_info = bpf_iter_cgroup_fill_link_info, 282 .ctx_arg_info_size = 1, 283 .ctx_arg_info = { 284 { offsetof(struct bpf_iter__cgroup, cgroup), 285 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, 286 }, 287 .seq_info = &cgroup_iter_seq_info, 288 }; 289 290 static int __init bpf_cgroup_iter_init(void) 291 { 292 bpf_cgroup_reg_info.ctx_arg_info[0].btf_id = bpf_cgroup_btf_id[0]; 293 return bpf_iter_reg_target(&bpf_cgroup_reg_info); 294 } 295 296 late_initcall(bpf_cgroup_iter_init); 297 298 struct bpf_iter_css { 299 __u64 __opaque[3]; 300 } __attribute__((aligned(8))); 301 302 struct bpf_iter_css_kern { 303 struct cgroup_subsys_state *start; 304 struct cgroup_subsys_state *pos; 305 unsigned int flags; 306 } __attribute__((aligned(8))); 307 308 __bpf_kfunc_start_defs(); 309 310 __bpf_kfunc int bpf_iter_css_new(struct bpf_iter_css *it, 311 struct cgroup_subsys_state *start, unsigned int flags) 312 { 313 struct bpf_iter_css_kern *kit = (void *)it; 314 315 BUILD_BUG_ON(sizeof(struct bpf_iter_css_kern) > sizeof(struct bpf_iter_css)); 316 BUILD_BUG_ON(__alignof__(struct bpf_iter_css_kern) != __alignof__(struct bpf_iter_css)); 317 318 kit->start = NULL; 319 switch (flags) { 320 case BPF_CGROUP_ITER_DESCENDANTS_PRE: 321 case BPF_CGROUP_ITER_DESCENDANTS_POST: 322 case BPF_CGROUP_ITER_ANCESTORS_UP: 323 break; 324 default: 325 return -EINVAL; 326 } 327 328 kit->start = start; 329 kit->pos = NULL; 330 kit->flags = flags; 331 return 0; 332 } 333 334 __bpf_kfunc struct cgroup_subsys_state *bpf_iter_css_next(struct bpf_iter_css *it) 335 { 336 struct bpf_iter_css_kern *kit = (void *)it; 337 338 if (!kit->start) 339 return NULL; 340 341 switch (kit->flags) { 342 case BPF_CGROUP_ITER_DESCENDANTS_PRE: 343 kit->pos = css_next_descendant_pre(kit->pos, kit->start); 344 break; 345 case BPF_CGROUP_ITER_DESCENDANTS_POST: 346 kit->pos = css_next_descendant_post(kit->pos, kit->start); 347 break; 348 case BPF_CGROUP_ITER_ANCESTORS_UP: 349 kit->pos = kit->pos ? kit->pos->parent : kit->start; 350 } 351 352 return kit->pos; 353 } 354 355 __bpf_kfunc void bpf_iter_css_destroy(struct bpf_iter_css *it) 356 { 357 } 358 359 __bpf_kfunc_end_defs(); 360
Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.