~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/net/sched/sch_api.c

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * net/sched/sch_api.c  Packet scheduler API.
  4  *
  5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  6  *
  7  * Fixes:
  8  *
  9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 12  */
 13 
 14 #include <linux/module.h>
 15 #include <linux/types.h>
 16 #include <linux/kernel.h>
 17 #include <linux/string.h>
 18 #include <linux/errno.h>
 19 #include <linux/skbuff.h>
 20 #include <linux/init.h>
 21 #include <linux/proc_fs.h>
 22 #include <linux/seq_file.h>
 23 #include <linux/kmod.h>
 24 #include <linux/list.h>
 25 #include <linux/hrtimer.h>
 26 #include <linux/slab.h>
 27 #include <linux/hashtable.h>
 28 
 29 #include <net/net_namespace.h>
 30 #include <net/sock.h>
 31 #include <net/netlink.h>
 32 #include <net/pkt_sched.h>
 33 #include <net/pkt_cls.h>
 34 #include <net/tc_wrapper.h>
 35 
 36 #include <trace/events/qdisc.h>
 37 
 38 /*
 39 
 40    Short review.
 41    -------------
 42 
 43    This file consists of two interrelated parts:
 44 
 45    1. queueing disciplines manager frontend.
 46    2. traffic classes manager frontend.
 47 
 48    Generally, queueing discipline ("qdisc") is a black box,
 49    which is able to enqueue packets and to dequeue them (when
 50    device is ready to send something) in order and at times
 51    determined by algorithm hidden in it.
 52 
  53    qdiscs are divided into two categories:
 54    - "queues", which have no internal structure visible from outside.
 55    - "schedulers", which split all the packets to "traffic classes",
 56      using "packet classifiers" (look at cls_api.c)
 57 
 58    In turn, classes may have child qdiscs (as rule, queues)
 59    attached to them etc. etc. etc.
 60 
  61    The goal of the routines in this file is to translate
  62    information supplied by the user in the form of handles
  63    into a form more intelligible to the kernel, to perform sanity
  64    checks and the part of the work that is common to all qdiscs,
  65    and to provide rtnetlink notifications.
 66 
 67    All real intelligent work is done inside qdisc modules.
 68 
 69 
 70 
 71    Every discipline has two major routines: enqueue and dequeue.
 72 
 73    ---dequeue
 74 
 75    dequeue usually returns a skb to send. It is allowed to return NULL,
 76    but it does not mean that queue is empty, it just means that
 77    discipline does not want to send anything this time.
 78    Queue is really empty if q->q.qlen == 0.
 79    For complicated disciplines with multiple queues q->q is not
 80    real packet queue, but however q->q.qlen must be valid.
 81 
 82    ---enqueue
 83 
 84    enqueue returns 0, if packet was enqueued successfully.
 85    If packet (this one or another one) was dropped, it returns
 86    not zero error code.
 87    NET_XMIT_DROP        - this packet dropped
 88      Expected action: do not backoff, but wait until queue will clear.
 89    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
 90      Expected action: backoff or ignore
 91 
 92    Auxiliary routines:
 93 
 94    ---peek
 95 
 96    like dequeue but without removing a packet from the queue
 97 
 98    ---reset
 99 
100    returns qdisc to initial state: purge all buffers, clear all
101    timers, counters (except for statistics) etc.
102 
103    ---init
104 
105    initializes newly created qdisc.
106 
107    ---destroy
108 
109    destroys resources allocated by init and during lifetime of qdisc.
110 
111    ---change
112 
113    changes qdisc parameters.
114  */
115 
/* Protects the list of registered TC modules (qdisc_base). It is a pure
 * SMP lock: in this file it is taken for writing by register_qdisc(),
 * unregister_qdisc() and qdisc_set_default(), and for reading by
 * qdisc_get_default() and qdisc_lookup_ops().
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
118 
119 
120 /************************************************
121  *      Queueing disciplines manipulation.      *
122  ************************************************/
123 
124 
/* The list of all installed queueing disciplines: head of a singly
 * linked list chained through Qdisc_ops::next.
 */

static struct Qdisc_ops *qdisc_base;
128 
129 /* Register/unregister queueing discipline */
130 
131 int register_qdisc(struct Qdisc_ops *qops)
132 {
133         struct Qdisc_ops *q, **qp;
134         int rc = -EEXIST;
135 
136         write_lock(&qdisc_mod_lock);
137         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
138                 if (!strcmp(qops->id, q->id))
139                         goto out;
140 
141         if (qops->enqueue == NULL)
142                 qops->enqueue = noop_qdisc_ops.enqueue;
143         if (qops->peek == NULL) {
144                 if (qops->dequeue == NULL)
145                         qops->peek = noop_qdisc_ops.peek;
146                 else
147                         goto out_einval;
148         }
149         if (qops->dequeue == NULL)
150                 qops->dequeue = noop_qdisc_ops.dequeue;
151 
152         if (qops->cl_ops) {
153                 const struct Qdisc_class_ops *cops = qops->cl_ops;
154 
155                 if (!(cops->find && cops->walk && cops->leaf))
156                         goto out_einval;
157 
158                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
159                         goto out_einval;
160         }
161 
162         qops->next = NULL;
163         *qp = qops;
164         rc = 0;
165 out:
166         write_unlock(&qdisc_mod_lock);
167         return rc;
168 
169 out_einval:
170         rc = -EINVAL;
171         goto out;
172 }
173 EXPORT_SYMBOL(register_qdisc);
174 
175 void unregister_qdisc(struct Qdisc_ops *qops)
176 {
177         struct Qdisc_ops *q, **qp;
178         int err = -ENOENT;
179 
180         write_lock(&qdisc_mod_lock);
181         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
182                 if (q == qops)
183                         break;
184         if (q) {
185                 *qp = q->next;
186                 q->next = NULL;
187                 err = 0;
188         }
189         write_unlock(&qdisc_mod_lock);
190 
191         WARN(err, "unregister qdisc(%s) failed\n", qops->id);
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194 
/* Get default qdisc if not otherwise specified.
 *
 * Copies the id of the current default qdisc ops into @name, bounded by
 * @len via strscpy(), while holding qdisc_mod_lock for reading.
 */
void qdisc_get_default(char *name, size_t len)
{
        read_lock(&qdisc_mod_lock);
        strscpy(name, default_qdisc_ops->id, len);
        read_unlock(&qdisc_mod_lock);
}
202 
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205         struct Qdisc_ops *q = NULL;
206 
207         for (q = qdisc_base; q; q = q->next) {
208                 if (!strcmp(name, q->id)) {
209                         if (!try_module_get(q->owner))
210                                 q = NULL;
211                         break;
212                 }
213         }
214 
215         return q;
216 }
217 
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221         const struct Qdisc_ops *ops;
222 
223         if (!capable(CAP_NET_ADMIN))
224                 return -EPERM;
225 
226         write_lock(&qdisc_mod_lock);
227         ops = qdisc_lookup_default(name);
228         if (!ops) {
229                 /* Not found, drop lock and try to load module */
230                 write_unlock(&qdisc_mod_lock);
231                 request_module(NET_SCH_ALIAS_PREFIX "%s", name);
232                 write_lock(&qdisc_mod_lock);
233 
234                 ops = qdisc_lookup_default(name);
235         }
236 
237         if (ops) {
238                 /* Set new default */
239                 module_put(default_qdisc_ops->owner);
240                 default_qdisc_ops = ops;
241         }
242         write_unlock(&qdisc_mod_lock);
243 
244         return ops ? 0 : -ENOENT;
245 }
246 
#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config.
 *
 * Registered as a late initcall; passes CONFIG_DEFAULT_NET_SCH to
 * qdisc_set_default() and returns its result.
 */
static int __init sch_default_qdisc(void)
{
        return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
255 
256 /* We know handle. Find qdisc among all qdisc's attached to device
257  * (root qdisc, all its children, children of children etc.)
258  * Note: caller either uses rtnl or rcu_read_lock()
259  */
260 
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263         struct Qdisc *q;
264 
265         if (!qdisc_dev(root))
266                 return (root->handle == handle ? root : NULL);
267 
268         if (!(root->flags & TCQ_F_BUILTIN) &&
269             root->handle == handle)
270                 return root;
271 
272         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
273                                    lockdep_rtnl_is_held()) {
274                 if (q->handle == handle)
275                         return q;
276         }
277         return NULL;
278 }
279 
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283                 ASSERT_RTNL();
284                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285                 if (invisible)
286                         q->flags |= TCQ_F_INVISIBLE;
287         }
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290 
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294                 ASSERT_RTNL();
295                 hash_del_rcu(&q->hash);
296         }
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299 
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302         struct Qdisc *q;
303 
304         if (!handle)
305                 return NULL;
306         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
307         if (q)
308                 goto out;
309 
310         if (dev_ingress_queue(dev))
311                 q = qdisc_match_from_root(
312                         rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
313                         handle);
314 out:
315         return q;
316 }
317 
318 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
319 {
320         struct netdev_queue *nq;
321         struct Qdisc *q;
322 
323         if (!handle)
324                 return NULL;
325         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
326         if (q)
327                 goto out;
328 
329         nq = dev_ingress_queue_rcu(dev);
330         if (nq)
331                 q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
332                                           handle);
333 out:
334         return q;
335 }
336 
337 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
338 {
339         unsigned long cl;
340         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
341 
342         if (cops == NULL)
343                 return NULL;
344         cl = cops->find(p, classid);
345 
346         if (cl == 0)
347                 return NULL;
348         return cops->leaf(p, cl);
349 }
350 
351 /* Find queueing discipline by name */
352 
353 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
354 {
355         struct Qdisc_ops *q = NULL;
356 
357         if (kind) {
358                 read_lock(&qdisc_mod_lock);
359                 for (q = qdisc_base; q; q = q->next) {
360                         if (nla_strcmp(kind, q->id) == 0) {
361                                 if (!try_module_get(q->owner))
362                                         q = NULL;
363                                 break;
364                         }
365                 }
366                 read_unlock(&qdisc_mod_lock);
367         }
368         return q;
369 }
370 
371 /* The linklayer setting were not transferred from iproute2, in older
372  * versions, and the rate tables lookup systems have been dropped in
373  * the kernel. To keep backward compatible with older iproute2 tc
374  * utils, we detect the linklayer setting by detecting if the rate
375  * table were modified.
376  *
377  * For linklayer ATM table entries, the rate table will be aligned to
378  * 48 bytes, thus some table entries will contain the same value.  The
379  * mpu (min packet unit) is also encoded into the old rate table, thus
380  * starting from the mpu, we find low and high table entries for
381  * mapping this cell.  If these entries contain the same value, when
382  * the rate tables have been modified for linklayer ATM.
383  *
384  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
385  * and then roundup to the next cell, calc the table entry one below,
386  * and compare.
387  */
388 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
389 {
390         int low       = roundup(r->mpu, 48);
391         int high      = roundup(low+1, 48);
392         int cell_low  = low >> r->cell_log;
393         int cell_high = (high >> r->cell_log) - 1;
394 
395         /* rtab is too inaccurate at rates > 100Mbit/s */
396         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
397                 pr_debug("TC linklayer: Giving up ATM detection\n");
398                 return TC_LINKLAYER_ETHERNET;
399         }
400 
401         if ((cell_high > cell_low) && (cell_high < 256)
402             && (rtab[cell_low] == rtab[cell_high])) {
403                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
404                          cell_low, cell_high, rtab[cell_high]);
405                 return TC_LINKLAYER_ATM;
406         }
407         return TC_LINKLAYER_ETHERNET;
408 }
409 
/* Head of the singly linked list (chained through ->next) of shared,
 * reference-counted rate tables handed out by qdisc_get_rtab().
 */
static struct qdisc_rate_table *qdisc_rtab_list;
411 
412 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
413                                         struct nlattr *tab,
414                                         struct netlink_ext_ack *extack)
415 {
416         struct qdisc_rate_table *rtab;
417 
418         if (tab == NULL || r->rate == 0 ||
419             r->cell_log == 0 || r->cell_log >= 32 ||
420             nla_len(tab) != TC_RTAB_SIZE) {
421                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
422                 return NULL;
423         }
424 
425         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
426                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
427                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
428                         rtab->refcnt++;
429                         return rtab;
430                 }
431         }
432 
433         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
434         if (rtab) {
435                 rtab->rate = *r;
436                 rtab->refcnt = 1;
437                 memcpy(rtab->data, nla_data(tab), 1024);
438                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
439                         r->linklayer = __detect_linklayer(r, rtab->data);
440                 rtab->next = qdisc_rtab_list;
441                 qdisc_rtab_list = rtab;
442         } else {
443                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
444         }
445         return rtab;
446 }
447 EXPORT_SYMBOL(qdisc_get_rtab);
448 
449 void qdisc_put_rtab(struct qdisc_rate_table *tab)
450 {
451         struct qdisc_rate_table *rtab, **rtabp;
452 
453         if (!tab || --tab->refcnt)
454                 return;
455 
456         for (rtabp = &qdisc_rtab_list;
457              (rtab = *rtabp) != NULL;
458              rtabp = &rtab->next) {
459                 if (rtab == tab) {
460                         *rtabp = rtab->next;
461                         kfree(rtab);
462                         return;
463                 }
464         }
465 }
466 EXPORT_SYMBOL(qdisc_put_rtab);
467 
/* List of shared, reference-counted size tables; entries are added by
 * qdisc_get_stab() and removed by qdisc_put_stab().
 */
static LIST_HEAD(qdisc_stab_list);
469 
/* Netlink attribute policy used by qdisc_get_stab() when parsing the
 * nested size table attribute.
 */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
        [TCA_STAB_DATA] = { .type = NLA_BINARY },
};
474 
475 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
476                                                struct netlink_ext_ack *extack)
477 {
478         struct nlattr *tb[TCA_STAB_MAX + 1];
479         struct qdisc_size_table *stab;
480         struct tc_sizespec *s;
481         unsigned int tsize = 0;
482         u16 *tab = NULL;
483         int err;
484 
485         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
486                                           extack);
487         if (err < 0)
488                 return ERR_PTR(err);
489         if (!tb[TCA_STAB_BASE]) {
490                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
491                 return ERR_PTR(-EINVAL);
492         }
493 
494         s = nla_data(tb[TCA_STAB_BASE]);
495 
496         if (s->tsize > 0) {
497                 if (!tb[TCA_STAB_DATA]) {
498                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
499                         return ERR_PTR(-EINVAL);
500                 }
501                 tab = nla_data(tb[TCA_STAB_DATA]);
502                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
503         }
504 
505         if (tsize != s->tsize || (!tab && tsize > 0)) {
506                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
507                 return ERR_PTR(-EINVAL);
508         }
509 
510         list_for_each_entry(stab, &qdisc_stab_list, list) {
511                 if (memcmp(&stab->szopts, s, sizeof(*s)))
512                         continue;
513                 if (tsize > 0 &&
514                     memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
515                         continue;
516                 stab->refcnt++;
517                 return stab;
518         }
519 
520         if (s->size_log > STAB_SIZE_LOG_MAX ||
521             s->cell_log > STAB_SIZE_LOG_MAX) {
522                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
523                 return ERR_PTR(-EINVAL);
524         }
525 
526         stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
527         if (!stab)
528                 return ERR_PTR(-ENOMEM);
529 
530         stab->refcnt = 1;
531         stab->szopts = *s;
532         if (tsize > 0)
533                 memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
534 
535         list_add_tail(&stab->list, &qdisc_stab_list);
536 
537         return stab;
538 }
539 
540 void qdisc_put_stab(struct qdisc_size_table *tab)
541 {
542         if (!tab)
543                 return;
544 
545         if (--tab->refcnt == 0) {
546                 list_del(&tab->list);
547                 kfree_rcu(tab, rcu);
548         }
549 }
550 EXPORT_SYMBOL(qdisc_put_stab);
551 
552 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
553 {
554         struct nlattr *nest;
555 
556         nest = nla_nest_start_noflag(skb, TCA_STAB);
557         if (nest == NULL)
558                 goto nla_put_failure;
559         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
560                 goto nla_put_failure;
561         nla_nest_end(skb, nest);
562 
563         return skb->len;
564 
565 nla_put_failure:
566         return -1;
567 }
568 
569 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
570                                const struct qdisc_size_table *stab)
571 {
572         int pkt_len, slot;
573 
574         pkt_len = skb->len + stab->szopts.overhead;
575         if (unlikely(!stab->szopts.tsize))
576                 goto out;
577 
578         slot = pkt_len + stab->szopts.cell_align;
579         if (unlikely(slot < 0))
580                 slot = 0;
581 
582         slot >>= stab->szopts.cell_log;
583         if (likely(slot < stab->szopts.tsize))
584                 pkt_len = stab->data[slot];
585         else
586                 pkt_len = stab->data[stab->szopts.tsize - 1] *
587                                 (slot / stab->szopts.tsize) +
588                                 stab->data[slot % stab->szopts.tsize];
589 
590         pkt_len <<= stab->szopts.size_log;
591 out:
592         if (unlikely(pkt_len < 1))
593                 pkt_len = 1;
594         qdisc_skb_cb(skb)->pkt_len = pkt_len;
595 }
596 
597 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
598 {
599         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
600                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
601                         txt, qdisc->ops->id, qdisc->handle >> 16);
602                 qdisc->flags |= TCQ_F_WARN_NONWC;
603         }
604 }
605 EXPORT_SYMBOL(qdisc_warn_nonwc);
606 
607 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
608 {
609         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
610                                                  timer);
611 
612         rcu_read_lock();
613         __netif_schedule(qdisc_root(wd->qdisc));
614         rcu_read_unlock();
615 
616         return HRTIMER_NORESTART;
617 }
618 
619 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
620                                  clockid_t clockid)
621 {
622         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
623         wd->timer.function = qdisc_watchdog;
624         wd->qdisc = qdisc;
625 }
626 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
627 
/* Initialise watchdog @wd for @qdisc using CLOCK_MONOTONIC; a thin
 * wrapper around qdisc_watchdog_init_clockid().
 */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
        qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
633 
634 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
635                                       u64 delta_ns)
636 {
637         bool deactivated;
638 
639         rcu_read_lock();
640         deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
641                                &qdisc_root_sleeping(wd->qdisc)->state);
642         rcu_read_unlock();
643         if (deactivated)
644                 return;
645 
646         if (hrtimer_is_queued(&wd->timer)) {
647                 u64 softexpires;
648 
649                 softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
650                 /* If timer is already set in [expires, expires + delta_ns],
651                  * do not reprogram it.
652                  */
653                 if (softexpires - expires <= delta_ns)
654                         return;
655         }
656 
657         hrtimer_start_range_ns(&wd->timer,
658                                ns_to_ktime(expires),
659                                delta_ns,
660                                HRTIMER_MODE_ABS_PINNED);
661 }
662 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
663 
/* Cancel the timer of watchdog @wd via hrtimer_cancel(). */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
        hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
669 
670 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
671 {
672         struct hlist_head *h;
673         unsigned int i;
674 
675         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
676 
677         if (h != NULL) {
678                 for (i = 0; i < n; i++)
679                         INIT_HLIST_HEAD(&h[i]);
680         }
681         return h;
682 }
683 
684 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
685 {
686         struct Qdisc_class_common *cl;
687         struct hlist_node *next;
688         struct hlist_head *nhash, *ohash;
689         unsigned int nsize, nmask, osize;
690         unsigned int i, h;
691 
692         /* Rehash when load factor exceeds 0.75 */
693         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
694                 return;
695         nsize = clhash->hashsize * 2;
696         nmask = nsize - 1;
697         nhash = qdisc_class_hash_alloc(nsize);
698         if (nhash == NULL)
699                 return;
700 
701         ohash = clhash->hash;
702         osize = clhash->hashsize;
703 
704         sch_tree_lock(sch);
705         for (i = 0; i < osize; i++) {
706                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
707                         h = qdisc_class_hash(cl->classid, nmask);
708                         hlist_add_head(&cl->hnode, &nhash[h]);
709                 }
710         }
711         clhash->hash     = nhash;
712         clhash->hashsize = nsize;
713         clhash->hashmask = nmask;
714         sch_tree_unlock(sch);
715 
716         kvfree(ohash);
717 }
718 EXPORT_SYMBOL(qdisc_class_hash_grow);
719 
720 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
721 {
722         unsigned int size = 4;
723 
724         clhash->hash = qdisc_class_hash_alloc(size);
725         if (!clhash->hash)
726                 return -ENOMEM;
727         clhash->hashsize  = size;
728         clhash->hashmask  = size - 1;
729         clhash->hashelems = 0;
730         return 0;
731 }
732 EXPORT_SYMBOL(qdisc_class_hash_init);
733 
/* Free the bucket array of @clhash.  Only the array is released here;
 * nothing is done to any entries still linked in it.
 */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
        kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
739 
740 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
741                              struct Qdisc_class_common *cl)
742 {
743         unsigned int h;
744 
745         INIT_HLIST_NODE(&cl->hnode);
746         h = qdisc_class_hash(cl->classid, clhash->hashmask);
747         hlist_add_head(&cl->hnode, &clhash->hash[h]);
748         clhash->hashelems++;
749 }
750 EXPORT_SYMBOL(qdisc_class_hash_insert);
751 
/* Unlink class @cl from its hash bucket and decrement the element count
 * of @clhash.
 */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
                             struct Qdisc_class_common *cl)
{
        hlist_del(&cl->hnode);
        clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
759 
760 /* Allocate an unique handle from space managed by kernel
761  * Possible range is [8000-FFFF]:0000 (0x8000 values)
762  */
763 static u32 qdisc_alloc_handle(struct net_device *dev)
764 {
765         int i = 0x8000;
766         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
767 
768         do {
769                 autohandle += TC_H_MAKE(0x10000U, 0);
770                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
771                         autohandle = TC_H_MAKE(0x80000000U, 0);
772                 if (!qdisc_lookup(dev, autohandle))
773                         return autohandle;
774                 cond_resched();
775         } while (--i > 0);
776 
777         return 0;
778 }
779 
780 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
781 {
782         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
783         const struct Qdisc_class_ops *cops;
784         unsigned long cl;
785         u32 parentid;
786         bool notify;
787         int drops;
788 
789         if (n == 0 && len == 0)
790                 return;
791         drops = max_t(int, n, 0);
792         rcu_read_lock();
793         while ((parentid = sch->parent)) {
794                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
795                         break;
796 
797                 if (sch->flags & TCQ_F_NOPARENT)
798                         break;
799                 /* Notify parent qdisc only if child qdisc becomes empty.
800                  *
801                  * If child was empty even before update then backlog
802                  * counter is screwed and we skip notification because
803                  * parent class is already passive.
804                  *
805                  * If the original child was offloaded then it is allowed
806                  * to be seem as empty, so the parent is notified anyway.
807                  */
808                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
809                                                        !qdisc_is_offloaded);
810                 /* TODO: perform the search on a per txq basis */
811                 sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid));
812                 if (sch == NULL) {
813                         WARN_ON_ONCE(parentid != TC_H_ROOT);
814                         break;
815                 }
816                 cops = sch->ops->cl_ops;
817                 if (notify && cops->qlen_notify) {
818                         cl = cops->find(sch, parentid);
819                         cops->qlen_notify(sch, cl);
820                 }
821                 sch->q.qlen -= n;
822                 sch->qstats.backlog -= len;
823                 __qdisc_qstats_drop(sch, drops);
824         }
825         rcu_read_unlock();
826 }
827 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
828 
829 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
830                               void *type_data)
831 {
832         struct net_device *dev = qdisc_dev(sch);
833         int err;
834 
835         sch->flags &= ~TCQ_F_OFFLOADED;
836         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
837                 return 0;
838 
839         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
840         if (err == -EOPNOTSUPP)
841                 return 0;
842 
843         if (!err)
844                 sch->flags |= TCQ_F_OFFLOADED;
845 
846         return err;
847 }
848 EXPORT_SYMBOL(qdisc_offload_dump_helper);
849 
850 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
851                                 struct Qdisc *new, struct Qdisc *old,
852                                 enum tc_setup_type type, void *type_data,
853                                 struct netlink_ext_ack *extack)
854 {
855         bool any_qdisc_is_offloaded;
856         int err;
857 
858         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
859                 return;
860 
861         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
862 
863         /* Don't report error if the graft is part of destroy operation. */
864         if (!err || !new || new == &noop_qdisc)
865                 return;
866 
867         /* Don't report error if the parent, the old child and the new
868          * one are not offloaded.
869          */
870         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
871         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
872         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
873 
874         if (any_qdisc_is_offloaded)
875                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
876 }
877 EXPORT_SYMBOL(qdisc_offload_graft_helper);
878 
879 void qdisc_offload_query_caps(struct net_device *dev,
880                               enum tc_setup_type type,
881                               void *caps, size_t caps_len)
882 {
883         const struct net_device_ops *ops = dev->netdev_ops;
884         struct tc_query_caps_base base = {
885                 .type = type,
886                 .caps = caps,
887         };
888 
889         memset(caps, 0, caps_len);
890 
891         if (ops->ndo_setup_tc)
892                 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
893 }
894 EXPORT_SYMBOL(qdisc_offload_query_caps);
895 
896 static void qdisc_offload_graft_root(struct net_device *dev,
897                                      struct Qdisc *new, struct Qdisc *old,
898                                      struct netlink_ext_ack *extack)
899 {
900         struct tc_root_qopt_offload graft_offload = {
901                 .command        = TC_ROOT_GRAFT,
902                 .handle         = new ? new->handle : 0,
903                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
904                                   (old && old->flags & TCQ_F_INGRESS),
905         };
906 
907         qdisc_offload_graft_helper(dev, NULL, new, old,
908                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
909 }
910 
911 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
912                          u32 portid, u32 seq, u16 flags, int event,
913                          struct netlink_ext_ack *extack)
914 {
915         struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
916         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
917         struct tcmsg *tcm;
918         struct nlmsghdr  *nlh;
919         unsigned char *b = skb_tail_pointer(skb);
920         struct gnet_dump d;
921         struct qdisc_size_table *stab;
922         u32 block_index;
923         __u32 qlen;
924 
925         cond_resched();
926         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
927         if (!nlh)
928                 goto out_nlmsg_trim;
929         tcm = nlmsg_data(nlh);
930         tcm->tcm_family = AF_UNSPEC;
931         tcm->tcm__pad1 = 0;
932         tcm->tcm__pad2 = 0;
933         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
934         tcm->tcm_parent = clid;
935         tcm->tcm_handle = q->handle;
936         tcm->tcm_info = refcount_read(&q->refcnt);
937         if (nla_put_string(skb, TCA_KIND, q->ops->id))
938                 goto nla_put_failure;
939         if (q->ops->ingress_block_get) {
940                 block_index = q->ops->ingress_block_get(q);
941                 if (block_index &&
942                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
943                         goto nla_put_failure;
944         }
945         if (q->ops->egress_block_get) {
946                 block_index = q->ops->egress_block_get(q);
947                 if (block_index &&
948                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
949                         goto nla_put_failure;
950         }
951         if (q->ops->dump && q->ops->dump(q, skb) < 0)
952                 goto nla_put_failure;
953         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
954                 goto nla_put_failure;
955         qlen = qdisc_qlen_sum(q);
956 
957         stab = rtnl_dereference(q->stab);
958         if (stab && qdisc_dump_stab(skb, stab) < 0)
959                 goto nla_put_failure;
960 
961         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
962                                          NULL, &d, TCA_PAD) < 0)
963                 goto nla_put_failure;
964 
965         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
966                 goto nla_put_failure;
967 
968         if (qdisc_is_percpu_stats(q)) {
969                 cpu_bstats = q->cpu_bstats;
970                 cpu_qstats = q->cpu_qstats;
971         }
972 
973         if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
974             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
975             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
976                 goto nla_put_failure;
977 
978         if (gnet_stats_finish_copy(&d) < 0)
979                 goto nla_put_failure;
980 
981         if (extack && extack->_msg &&
982             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
983                 goto out_nlmsg_trim;
984 
985         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
986 
987         return skb->len;
988 
989 out_nlmsg_trim:
990 nla_put_failure:
991         nlmsg_trim(skb, b);
992         return -1;
993 }
994 
995 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
996 {
997         if (q->flags & TCQ_F_BUILTIN)
998                 return true;
999         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
1000                 return true;
1001 
1002         return false;
1003 }
1004 
1005 static int qdisc_get_notify(struct net *net, struct sk_buff *oskb,
1006                             struct nlmsghdr *n, u32 clid, struct Qdisc *q,
1007                             struct netlink_ext_ack *extack)
1008 {
1009         struct sk_buff *skb;
1010         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1011 
1012         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1013         if (!skb)
1014                 return -ENOBUFS;
1015 
1016         if (!tc_qdisc_dump_ignore(q, false)) {
1017                 if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0,
1018                                   RTM_NEWQDISC, extack) < 0)
1019                         goto err_out;
1020         }
1021 
1022         if (skb->len)
1023                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1024                                       n->nlmsg_flags & NLM_F_ECHO);
1025 
1026 err_out:
1027         kfree_skb(skb);
1028         return -EINVAL;
1029 }
1030 
1031 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1032                         struct nlmsghdr *n, u32 clid,
1033                         struct Qdisc *old, struct Qdisc *new,
1034                         struct netlink_ext_ack *extack)
1035 {
1036         struct sk_buff *skb;
1037         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1038 
1039         if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
1040                 return 0;
1041 
1042         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1043         if (!skb)
1044                 return -ENOBUFS;
1045 
1046         if (old && !tc_qdisc_dump_ignore(old, false)) {
1047                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1048                                   0, RTM_DELQDISC, extack) < 0)
1049                         goto err_out;
1050         }
1051         if (new && !tc_qdisc_dump_ignore(new, false)) {
1052                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1053                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
1054                         goto err_out;
1055         }
1056 
1057         if (skb->len)
1058                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1059                                       n->nlmsg_flags & NLM_F_ECHO);
1060 
1061 err_out:
1062         kfree_skb(skb);
1063         return -EINVAL;
1064 }
1065 
1066 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1067                                struct nlmsghdr *n, u32 clid,
1068                                struct Qdisc *old, struct Qdisc *new,
1069                                struct netlink_ext_ack *extack)
1070 {
1071         if (new || old)
1072                 qdisc_notify(net, skb, n, clid, old, new, extack);
1073 
1074         if (old)
1075                 qdisc_put(old);
1076 }
1077 
1078 static void qdisc_clear_nolock(struct Qdisc *sch)
1079 {
1080         sch->flags &= ~TCQ_F_NOLOCK;
1081         if (!(sch->flags & TCQ_F_CPUSTATS))
1082                 return;
1083 
1084         free_percpu(sch->cpu_bstats);
1085         free_percpu(sch->cpu_qstats);
1086         sch->cpu_bstats = NULL;
1087         sch->cpu_qstats = NULL;
1088         sch->flags &= ~TCQ_F_CPUSTATS;
1089 }
1090 
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 *
 * With a NULL "parent" the graft happens at device level: on the ingress
 * queue when either "old" or "new" has TCQ_F_INGRESS, otherwise on the
 * device's TX queues and dev->qdisc.  With a non-NULL "parent" the work
 * is delegated to the parent's class graft operation.
 *
 * Returns 0 on success, or:
 *   -ENOENT      the device has no ingress queue, or the class was not found
 *   -EBUSY       the current ingress/clsact qdisc still has other references
 *   -EOPNOTSUPP  "parent" has no class ops or no graft operation
 *   -EINVAL      "new" is a noqueue qdisc, or carries a size table while
 *                "parent" lacks TCQ_F_MQROOT
 *   any error returned by the parent's graft operation
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
                       struct Qdisc *new, struct Qdisc *old,
                       struct netlink_ext_ack *extack)
{
        struct Qdisc *q = old;
        struct net *net = dev_net(dev);

        if (parent == NULL) {
                /* Device-level graft (no parent qdisc). */
                unsigned int i, num_q, ingress;
                struct netdev_queue *dev_queue;

                ingress = 0;
                num_q = dev->num_tx_queues;
                if ((q && q->flags & TCQ_F_INGRESS) ||
                    (new && new->flags & TCQ_F_INGRESS)) {
                        ingress = 1;
                        dev_queue = dev_ingress_queue(dev);
                        if (!dev_queue) {
                                NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
                                return -ENOENT;
                        }

                        /* From here on "q" is whatever currently sits on the
                         * ingress queue, not the caller's "old".
                         */
                        q = rtnl_dereference(dev_queue->qdisc_sleeping);

                        /* This is the counterpart of that qdisc_refcount_inc_nz() call in
                         * __tcf_qdisc_find() for filter requests.
                         */
                        if (!qdisc_refcount_dec_if_one(q)) {
                                NL_SET_ERR_MSG(extack,
                                               "Current ingress or clsact Qdisc has ongoing filter requests");
                                return -EBUSY;
                        }
                }

                /* Deactivate a running device for the duration of the swap;
                 * it is activated again at the end of this branch.
                 */
                if (dev->flags & IFF_UP)
                        dev_deactivate(dev);

                qdisc_offload_graft_root(dev, new, old, extack);

                /* A non-ingress qdisc with an attach() operation skips the
                 * per-queue grafting below; attach() is called after
                 * dev->qdisc has been updated.
                 */
                if (new && new->ops->attach && !ingress)
                        goto skip;

                if (!ingress) {
                        /* Install "new" on every TX queue.  One extra
                         * reference is taken for each queue after the first,
                         * and the qdisc previously on each queue is put.
                         */
                        for (i = 0; i < num_q; i++) {
                                dev_queue = netdev_get_tx_queue(dev, i);
                                old = dev_graft_qdisc(dev_queue, new);

                                if (new && i > 0)
                                        qdisc_refcount_inc(new);
                                qdisc_put(old);
                        }
                } else {
                        /* First detach the current ingress qdisc ... */
                        old = dev_graft_qdisc(dev_queue, NULL);

                        /* {ingress,clsact}_destroy() @old before grafting @new to avoid
                         * unprotected concurrent accesses to net_device::miniq_{in,e}gress
                         * pointer(s) in mini_qdisc_pair_swap().
                         */
                        qdisc_notify(net, skb, n, classid, old, new, extack);
                        qdisc_destroy(old);

                        /* ... and only then install the new one. */
                        dev_graft_qdisc(dev_queue, new);
                }

skip:
                if (!ingress) {
                        /* Publish the new root in dev->qdisc, falling back to
                         * noop_qdisc when "new" is NULL.  A qdisc without
                         * attach() gets one more reference for this pointer.
                         */
                        old = rtnl_dereference(dev->qdisc);
                        if (new && !new->ops->attach)
                                qdisc_refcount_inc(new);
                        rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);

                        notify_and_destroy(net, skb, n, classid, old, new, extack);

                        if (new && new->ops->attach)
                                new->ops->attach(new);
                }

                if (dev->flags & IFF_UP)
                        dev_activate(dev);
        } else {
                /* Graft into a class of "parent". */
                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
                unsigned long cl;
                int err;

                /* Only support running class lockless if parent is lockless */
                if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
                        qdisc_clear_nolock(new);

                if (!cops || !cops->graft)
                        return -EOPNOTSUPP;

                cl = cops->find(parent, classid);
                if (!cl) {
                        NL_SET_ERR_MSG(extack, "Specified class not found");
                        return -ENOENT;
                }

                if (new && new->ops == &noqueue_qdisc_ops) {
                        NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
                        return -EINVAL;
                }

                /* A size table on "new" is accepted only when "parent" has
                 * TCQ_F_MQROOT.
                 */
                if (new &&
                    !(parent->flags & TCQ_F_MQROOT) &&
                    rcu_access_pointer(new->stab)) {
                        NL_SET_ERR_MSG(extack, "STAB not supported on a non root");
                        return -EINVAL;
                }
                /* The parent's graft operation hands back the replaced
                 * qdisc through "old".
                 */
                err = cops->graft(parent, cl, new, &old, extack);
                if (err)
                        return err;
                notify_and_destroy(net, skb, n, classid, old, new, extack);
        }
        return 0;
}
1216 
1217 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1218                                    struct netlink_ext_ack *extack)
1219 {
1220         u32 block_index;
1221 
1222         if (tca[TCA_INGRESS_BLOCK]) {
1223                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1224 
1225                 if (!block_index) {
1226                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1227                         return -EINVAL;
1228                 }
1229                 if (!sch->ops->ingress_block_set) {
1230                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1231                         return -EOPNOTSUPP;
1232                 }
1233                 sch->ops->ingress_block_set(sch, block_index);
1234         }
1235         if (tca[TCA_EGRESS_BLOCK]) {
1236                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1237 
1238                 if (!block_index) {
1239                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1240                         return -EINVAL;
1241                 }
1242                 if (!sch->ops->egress_block_set) {
1243                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1244                         return -EOPNOTSUPP;
1245                 }
1246                 sch->ops->egress_block_set(sch, block_index);
1247         }
1248         return 0;
1249 }
1250 
/*
 * Allocate and initialize a new qdisc.
 *
 * The qdisc kind and its parameters come from the netlink attributes in
 * "tca" (TCA_KIND, TCA_OPTIONS, TCA_STAB, TCA_RATE and the ingress/egress
 * block attributes).  A "handle" of 0 asks for an automatically allocated
 * handle; TC_H_INGRESS is only accepted for qdiscs with TCQ_F_INGRESS.
 *
 * Returns the new qdisc on success.  On failure returns NULL and stores
 * the error code in *errp.  With CONFIG_MODULES, -EAGAIN means the ops
 * became available after a module load request, during which the RTNL
 * lock was dropped, and the caller has to replay the request.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
                                  struct netdev_queue *dev_queue,
                                  u32 parent, u32 handle,
                                  struct nlattr **tca, int *errp,
                                  struct netlink_ext_ack *extack)
{
        int err;
        struct nlattr *kind = tca[TCA_KIND];
        struct Qdisc *sch;
        struct Qdisc_ops *ops;
        struct qdisc_size_table *stab;

        ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
        if (ops == NULL && kind != NULL) {
                char name[IFNAMSIZ];
                if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
                        /* We dropped the RTNL semaphore in order to
                         * perform the module load.  So, even if we
                         * succeeded in loading the module we have to
                         * tell the caller to replay the request.  We
                         * indicate this using -EAGAIN.
                         * We replay the request because the device may
                         * go away in the mean time.
                         */
                        rtnl_unlock();
                        request_module(NET_SCH_ALIAS_PREFIX "%s", name);
                        rtnl_lock();
                        ops = qdisc_lookup_ops(kind);
                        if (ops != NULL) {
                                /* We will try again qdisc_lookup_ops,
                                 * so don't keep a reference.
                                 */
                                module_put(ops->owner);
                                err = -EAGAIN;
                                goto err_out;
                        }
                }
        }
#endif

        err = -ENOENT;
        if (!ops) {
                NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
                goto err_out;
        }

        sch = qdisc_alloc(dev_queue, ops, extack);
        if (IS_ERR(sch)) {
                err = PTR_ERR(sch);
                goto err_out2;
        }

        sch->parent = parent;

        if (handle == TC_H_INGRESS) {
                /* The ingress handle is reserved for qdiscs flagged
                 * TCQ_F_INGRESS.
                 */
                if (!(sch->flags & TCQ_F_INGRESS)) {
                        NL_SET_ERR_MSG(extack,
                                       "Specified parent ID is reserved for ingress and clsact Qdiscs");
                        err = -EINVAL;
                        goto err_out3;
                }
                handle = TC_H_MAKE(TC_H_INGRESS, 0);
        } else {
                /* No handle supplied: allocate one; 0 back means none
                 * could be allocated.
                 */
                if (handle == 0) {
                        handle = qdisc_alloc_handle(dev);
                        if (handle == 0) {
                                NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
                                err = -ENOSPC;
                                goto err_out3;
                        }
                }
                if (!netif_is_multiqueue(dev))
                        sch->flags |= TCQ_F_ONETXQUEUE;
        }

        sch->handle = handle;

        /* This exists to stay backward compatible with a userspace
         * loophole that allowed userspace to get the IFF_NO_QUEUE
         * facility on older kernels by setting tx_queue_len=0 (prior
         * to qdisc init) and then forgetting to reinit tx_queue_len
         * before attaching a qdisc again.
         */
        if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
                WRITE_ONCE(dev->tx_queue_len, DEFAULT_TX_QUEUE_LEN);
                netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
        }

        err = qdisc_block_indexes_set(sch, tca, extack);
        if (err)
                goto err_out3;

        if (tca[TCA_STAB]) {
                stab = qdisc_get_stab(tca[TCA_STAB], extack);
                if (IS_ERR(stab)) {
                        err = PTR_ERR(stab);
                        goto err_out3;
                }
                rcu_assign_pointer(sch->stab, stab);
        }

        /* From here on failures unwind through err_out4, which also calls
         * ops->destroy() and releases the size table.
         */
        if (ops->init) {
                err = ops->init(sch, tca[TCA_OPTIONS], extack);
                if (err != 0)
                        goto err_out4;
        }

        if (tca[TCA_RATE]) {
                err = -EOPNOTSUPP;
                if (sch->flags & TCQ_F_MQROOT) {
                        NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
                        goto err_out4;
                }

                err = gen_new_estimator(&sch->bstats,
                                        sch->cpu_bstats,
                                        &sch->rate_est,
                                        NULL,
                                        true,
                                        tca[TCA_RATE]);
                if (err) {
                        NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
                        goto err_out4;
                }
        }

        qdisc_hash_add(sch, false);
        trace_qdisc_create(ops, dev, parent);

        return sch;

        /* Error unwinding: each label undoes one more setup stage and
         * falls through to the labels below it.
         */
err_out4:
        /* Even if ops->init() failed, we call ops->destroy()
         * like qdisc_create_dflt().
         */
        if (ops->destroy)
                ops->destroy(sch);
        qdisc_put_stab(rtnl_dereference(sch->stab));
err_out3:
        lockdep_unregister_key(&sch->root_lock_key);
        netdev_put(dev, &sch->dev_tracker);
        qdisc_free(sch);
err_out2:
        module_put(ops->owner);
err_out:
        *errp = err;
        return NULL;
}
1406 
1407 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1408                         struct netlink_ext_ack *extack)
1409 {
1410         struct qdisc_size_table *ostab, *stab = NULL;
1411         int err = 0;
1412 
1413         if (tca[TCA_OPTIONS]) {
1414                 if (!sch->ops->change) {
1415                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1416                         return -EINVAL;
1417                 }
1418                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1419                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1420                         return -EOPNOTSUPP;
1421                 }
1422                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1423                 if (err)
1424                         return err;
1425         }
1426 
1427         if (tca[TCA_STAB]) {
1428                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1429                 if (IS_ERR(stab))
1430                         return PTR_ERR(stab);
1431         }
1432 
1433         ostab = rtnl_dereference(sch->stab);
1434         rcu_assign_pointer(sch->stab, stab);
1435         qdisc_put_stab(ostab);
1436 
1437         if (tca[TCA_RATE]) {
1438                 /* NB: ignores errors from replace_estimator
1439                    because change can't be undone. */
1440                 if (sch->flags & TCQ_F_MQROOT)
1441                         goto out;
1442                 gen_replace_estimator(&sch->bstats,
1443                                       sch->cpu_bstats,
1444                                       &sch->rate_est,
1445                                       NULL,
1446                                       true,
1447                                       tca[TCA_RATE]);
1448         }
1449 out:
1450         return 0;
1451 }
1452 
/* Walker state used by check_loop() / check_loop_fn().
 *
 * "w" must stay the first member: check_loop_fn() receives a pointer to
 * it and casts that pointer back to struct check_loop_arg.
 */
struct check_loop_arg {
        struct qdisc_walker     w;      /* embedded walker handed to cl_ops->walk() */
        struct Qdisc            *p;     /* qdisc that must not show up among the leaves */
        int                     depth;  /* current nesting level of the search */
};

/* Forward declaration: check_loop() and check_loop_fn() call each other. */
static int check_loop_fn(struct Qdisc *q, unsigned long cl,
                         struct qdisc_walker *w);
1461 
1462 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1463 {
1464         struct check_loop_arg   arg;
1465 
1466         if (q->ops->cl_ops == NULL)
1467                 return 0;
1468 
1469         arg.w.stop = arg.w.skip = arg.w.count = 0;
1470         arg.w.fn = check_loop_fn;
1471         arg.depth = depth;
1472         arg.p = p;
1473         q->ops->cl_ops->walk(q, &arg.w);
1474         return arg.w.stop ? -ELOOP : 0;
1475 }
1476 
1477 static int
1478 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1479 {
1480         struct Qdisc *leaf;
1481         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1482         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1483 
1484         leaf = cops->leaf(q, cl);
1485         if (leaf) {
1486                 if (leaf == arg->p || arg->depth > 7)
1487                         return -ELOOP;
1488                 return check_loop(leaf, arg->p, arg->depth + 1);
1489         }
1490         return 0;
1491 }
1492 
/* Netlink attribute policy for TCA_* attributes.  In this file it is
 * passed to nlmsg_parse_deprecated() by tc_get_qdisc() and
 * tc_modify_qdisc().  Attributes without an entry here keep a
 * zero-initialized policy slot.
 */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
        [TCA_KIND]              = { .type = NLA_STRING },
        /* Binary blob with .len set to sizeof(struct tc_estimator). */
        [TCA_RATE]              = { .type = NLA_BINARY,
                                    .len = sizeof(struct tc_estimator) },
        [TCA_STAB]              = { .type = NLA_NESTED },
        [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
        [TCA_CHAIN]             = { .type = NLA_U32 },
        [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
        [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
};
1503 
1504 /*
1505  * Delete/get qdisc.
1506  */
1507 
1508 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1509                         struct netlink_ext_ack *extack)
1510 {
1511         struct net *net = sock_net(skb->sk);
1512         struct tcmsg *tcm = nlmsg_data(n);
1513         struct nlattr *tca[TCA_MAX + 1];
1514         struct net_device *dev;
1515         u32 clid;
1516         struct Qdisc *q = NULL;
1517         struct Qdisc *p = NULL;
1518         int err;
1519 
1520         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1521                                      rtm_tca_policy, extack);
1522         if (err < 0)
1523                 return err;
1524 
1525         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1526         if (!dev)
1527                 return -ENODEV;
1528 
1529         clid = tcm->tcm_parent;
1530         if (clid) {
1531                 if (clid != TC_H_ROOT) {
1532                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1533                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1534                                 if (!p) {
1535                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1536                                         return -ENOENT;
1537                                 }
1538                                 q = qdisc_leaf(p, clid);
1539                         } else if (dev_ingress_queue(dev)) {
1540                                 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1541                         }
1542                 } else {
1543                         q = rtnl_dereference(dev->qdisc);
1544                 }
1545                 if (!q) {
1546                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1547                         return -ENOENT;
1548                 }
1549 
1550                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1551                         NL_SET_ERR_MSG(extack, "Invalid handle");
1552                         return -EINVAL;
1553                 }
1554         } else {
1555                 q = qdisc_lookup(dev, tcm->tcm_handle);
1556                 if (!q) {
1557                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1558                         return -ENOENT;
1559                 }
1560         }
1561 
1562         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1563                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1564                 return -EINVAL;
1565         }
1566 
1567         if (n->nlmsg_type == RTM_DELQDISC) {
1568                 if (!clid) {
1569                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1570                         return -EINVAL;
1571                 }
1572                 if (q->handle == 0) {
1573                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1574                         return -ENOENT;
1575                 }
1576                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1577                 if (err != 0)
1578                         return err;
1579         } else {
1580                 qdisc_get_notify(net, skb, n, clid, q, NULL);
1581         }
1582         return 0;
1583 }
1584 
1585 static bool req_create_or_replace(struct nlmsghdr *n)
1586 {
1587         return (n->nlmsg_flags & NLM_F_CREATE &&
1588                 n->nlmsg_flags & NLM_F_REPLACE);
1589 }
1590 
1591 static bool req_create_exclusive(struct nlmsghdr *n)
1592 {
1593         return (n->nlmsg_flags & NLM_F_CREATE &&
1594                 n->nlmsg_flags & NLM_F_EXCL);
1595 }
1596 
1597 static bool req_change(struct nlmsghdr *n)
1598 {
1599         return (!(n->nlmsg_flags & NLM_F_CREATE) &&
1600                 !(n->nlmsg_flags & NLM_F_REPLACE) &&
1601                 !(n->nlmsg_flags & NLM_F_EXCL));
1602 }
1603 
1604 /*
1605  * Create/change qdisc.
1606  */
1607 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1608                            struct netlink_ext_ack *extack)
1609 {
1610         struct net *net = sock_net(skb->sk);
1611         struct tcmsg *tcm;
1612         struct nlattr *tca[TCA_MAX + 1];
1613         struct net_device *dev;
1614         u32 clid;
1615         struct Qdisc *q, *p;
1616         int err;
1617 
1618 replay:
1619         /* Reinit, just in case something touches this. */
1620         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1621                                      rtm_tca_policy, extack);
1622         if (err < 0)
1623                 return err;
1624 
1625         tcm = nlmsg_data(n);
1626         clid = tcm->tcm_parent;
1627         q = p = NULL;
1628 
1629         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1630         if (!dev)
1631                 return -ENODEV;
1632 
1633 
1634         if (clid) {
1635                 if (clid != TC_H_ROOT) {
1636                         if (clid != TC_H_INGRESS) {
1637                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1638                                 if (!p) {
1639                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1640                                         return -ENOENT;
1641                                 }
1642                                 q = qdisc_leaf(p, clid);
1643                         } else if (dev_ingress_queue_create(dev)) {
1644                                 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1645                         }
1646                 } else {
1647                         q = rtnl_dereference(dev->qdisc);
1648                 }
1649 
1650                 /* It may be default qdisc, ignore it */
1651                 if (q && q->handle == 0)
1652                         q = NULL;
1653 
1654                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1655                         if (tcm->tcm_handle) {
1656                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1657                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1658                                         return -EEXIST;
1659                                 }
1660                                 if (TC_H_MIN(tcm->tcm_handle)) {
1661                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1662                                         return -EINVAL;
1663                                 }
1664                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1665                                 if (!q)
1666                                         goto create_n_graft;
1667                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1668                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1669                                         return -EEXIST;
1670                                 }
1671                                 if (tca[TCA_KIND] &&
1672                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1673                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1674                                         return -EINVAL;
1675                                 }
1676                                 if (q->flags & TCQ_F_INGRESS) {
1677                                         NL_SET_ERR_MSG(extack,
1678                                                        "Cannot regraft ingress or clsact Qdiscs");
1679                                         return -EINVAL;
1680                                 }
1681                                 if (q == p ||
1682                                     (p && check_loop(q, p, 0))) {
1683                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1684                                         return -ELOOP;
1685                                 }
1686                                 if (clid == TC_H_INGRESS) {
1687                                         NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
1688                                         return -EINVAL;
1689                                 }
1690                                 qdisc_refcount_inc(q);
1691                                 goto graft;
1692                         } else {
1693                                 if (!q)
1694                                         goto create_n_graft;
1695 
1696                                 /* This magic test requires explanation.
1697                                  *
1698                                  *   We know, that some child q is already
1699                                  *   attached to this parent and have choice:
1700                                  *   1) change it or 2) create/graft new one.
1701                                  *   If the requested qdisc kind is different
1702                                  *   than the existing one, then we choose graft.
1703                                  *   If they are the same then this is "change"
1704                                  *   operation - just let it fallthrough..
1705                                  *
1706                                  *   1. We are allowed to create/graft only
1707                                  *   if the request is explicitly stating
1708                                  *   "please create if it doesn't exist".
1709                                  *
1710                                  *   2. If the request is to exclusive create
1711                                  *   then the qdisc tcm_handle is not expected
1712                                  *   to exist, so that we choose create/graft too.
1713                                  *
1714                                  *   3. The last case is when no flags are set.
1715                                  *   This will happen when for example tc
1716                                  *   utility issues a "change" command.
1717                                  *   Alas, it is sort of hole in API, we
1718                                  *   cannot decide what to do unambiguously.
1719                                  *   For now we select create/graft.
1720                                  */
1721                                 if (tca[TCA_KIND] &&
1722                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1723                                         if (req_create_or_replace(n) ||
1724                                             req_create_exclusive(n))
1725                                                 goto create_n_graft;
1726                                         else if (req_change(n))
1727                                                 goto create_n_graft2;
1728                                 }
1729                         }
1730                 }
1731         } else {
1732                 if (!tcm->tcm_handle) {
1733                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1734                         return -EINVAL;
1735                 }
1736                 q = qdisc_lookup(dev, tcm->tcm_handle);
1737         }
1738 
1739         /* Change qdisc parameters */
1740         if (!q) {
1741                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1742                 return -ENOENT;
1743         }
1744         if (n->nlmsg_flags & NLM_F_EXCL) {
1745                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1746                 return -EEXIST;
1747         }
1748         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1749                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1750                 return -EINVAL;
1751         }
1752         err = qdisc_change(q, tca, extack);
1753         if (err == 0)
1754                 qdisc_notify(net, skb, n, clid, NULL, q, extack);
1755         return err;
1756 
1757 create_n_graft:
1758         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1759                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1760                 return -ENOENT;
1761         }
1762 create_n_graft2:
1763         if (clid == TC_H_INGRESS) {
1764                 if (dev_ingress_queue(dev)) {
1765                         q = qdisc_create(dev, dev_ingress_queue(dev),
1766                                          tcm->tcm_parent, tcm->tcm_parent,
1767                                          tca, &err, extack);
1768                 } else {
1769                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1770                         err = -ENOENT;
1771                 }
1772         } else {
1773                 struct netdev_queue *dev_queue;
1774 
1775                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1776                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1777                 else if (p)
1778                         dev_queue = p->dev_queue;
1779                 else
1780                         dev_queue = netdev_get_tx_queue(dev, 0);
1781 
1782                 q = qdisc_create(dev, dev_queue,
1783                                  tcm->tcm_parent, tcm->tcm_handle,
1784                                  tca, &err, extack);
1785         }
1786         if (q == NULL) {
1787                 if (err == -EAGAIN)
1788                         goto replay;
1789                 return err;
1790         }
1791 
1792 graft:
1793         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1794         if (err) {
1795                 if (q)
1796                         qdisc_put(q);
1797                 return err;
1798         }
1799 
1800         return 0;
1801 }
1802 
1803 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1804                               struct netlink_callback *cb,
1805                               int *q_idx_p, int s_q_idx, bool recur,
1806                               bool dump_invisible)
1807 {
1808         int ret = 0, q_idx = *q_idx_p;
1809         struct Qdisc *q;
1810         int b;
1811 
1812         if (!root)
1813                 return 0;
1814 
1815         q = root;
1816         if (q_idx < s_q_idx) {
1817                 q_idx++;
1818         } else {
1819                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1820                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1821                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1822                                   RTM_NEWQDISC, NULL) <= 0)
1823                         goto done;
1824                 q_idx++;
1825         }
1826 
1827         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1828          * itself has already been dumped.
1829          *
1830          * If we've already dumped the top-level (ingress) qdisc above and the global
1831          * qdisc hashtable, we don't want to hit it again
1832          */
1833         if (!qdisc_dev(root) || !recur)
1834                 goto out;
1835 
1836         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1837                 if (q_idx < s_q_idx) {
1838                         q_idx++;
1839                         continue;
1840                 }
1841                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1842                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1843                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1844                                   RTM_NEWQDISC, NULL) <= 0)
1845                         goto done;
1846                 q_idx++;
1847         }
1848 
1849 out:
1850         *q_idx_p = q_idx;
1851         return ret;
1852 done:
1853         ret = -1;
1854         goto out;
1855 }
1856 
1857 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1858 {
1859         struct net *net = sock_net(skb->sk);
1860         int idx, q_idx;
1861         int s_idx, s_q_idx;
1862         struct net_device *dev;
1863         const struct nlmsghdr *nlh = cb->nlh;
1864         struct nlattr *tca[TCA_MAX + 1];
1865         int err;
1866 
1867         s_idx = cb->args[0];
1868         s_q_idx = q_idx = cb->args[1];
1869 
1870         idx = 0;
1871         ASSERT_RTNL();
1872 
1873         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1874                                      rtm_tca_policy, cb->extack);
1875         if (err < 0)
1876                 return err;
1877 
1878         for_each_netdev(net, dev) {
1879                 struct netdev_queue *dev_queue;
1880 
1881                 if (idx < s_idx)
1882                         goto cont;
1883                 if (idx > s_idx)
1884                         s_q_idx = 0;
1885                 q_idx = 0;
1886 
1887                 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1888                                        skb, cb, &q_idx, s_q_idx,
1889                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1890                         goto done;
1891 
1892                 dev_queue = dev_ingress_queue(dev);
1893                 if (dev_queue &&
1894                     tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
1895                                        skb, cb, &q_idx, s_q_idx, false,
1896                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1897                         goto done;
1898 
1899 cont:
1900                 idx++;
1901         }
1902 
1903 done:
1904         cb->args[0] = idx;
1905         cb->args[1] = q_idx;
1906 
1907         return skb->len;
1908 }
1909 
1910 
1911 
1912 /************************************************
1913  *      Traffic classes manipulation.           *
1914  ************************************************/
1915 
1916 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1917                           unsigned long cl, u32 portid, u32 seq, u16 flags,
1918                           int event, struct netlink_ext_ack *extack)
1919 {
1920         struct tcmsg *tcm;
1921         struct nlmsghdr  *nlh;
1922         unsigned char *b = skb_tail_pointer(skb);
1923         struct gnet_dump d;
1924         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1925 
1926         cond_resched();
1927         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1928         if (!nlh)
1929                 goto out_nlmsg_trim;
1930         tcm = nlmsg_data(nlh);
1931         tcm->tcm_family = AF_UNSPEC;
1932         tcm->tcm__pad1 = 0;
1933         tcm->tcm__pad2 = 0;
1934         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1935         tcm->tcm_parent = q->handle;
1936         tcm->tcm_handle = q->handle;
1937         tcm->tcm_info = 0;
1938         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1939                 goto nla_put_failure;
1940         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1941                 goto nla_put_failure;
1942 
1943         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1944                                          NULL, &d, TCA_PAD) < 0)
1945                 goto nla_put_failure;
1946 
1947         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1948                 goto nla_put_failure;
1949 
1950         if (gnet_stats_finish_copy(&d) < 0)
1951                 goto nla_put_failure;
1952 
1953         if (extack && extack->_msg &&
1954             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
1955                 goto out_nlmsg_trim;
1956 
1957         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1958 
1959         return skb->len;
1960 
1961 out_nlmsg_trim:
1962 nla_put_failure:
1963         nlmsg_trim(skb, b);
1964         return -1;
1965 }
1966 
1967 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1968                          struct nlmsghdr *n, struct Qdisc *q,
1969                          unsigned long cl, int event, struct netlink_ext_ack *extack)
1970 {
1971         struct sk_buff *skb;
1972         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1973 
1974         if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
1975                 return 0;
1976 
1977         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1978         if (!skb)
1979                 return -ENOBUFS;
1980 
1981         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
1982                 kfree_skb(skb);
1983                 return -EINVAL;
1984         }
1985 
1986         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1987                               n->nlmsg_flags & NLM_F_ECHO);
1988 }
1989 
1990 static int tclass_get_notify(struct net *net, struct sk_buff *oskb,
1991                              struct nlmsghdr *n, struct Qdisc *q,
1992                              unsigned long cl, struct netlink_ext_ack *extack)
1993 {
1994         struct sk_buff *skb;
1995         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1996 
1997         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1998         if (!skb)
1999                 return -ENOBUFS;
2000 
2001         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS,
2002                            extack) < 0) {
2003                 kfree_skb(skb);
2004                 return -EINVAL;
2005         }
2006 
2007         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
2008                               n->nlmsg_flags & NLM_F_ECHO);
2009 }
2010 
2011 static int tclass_del_notify(struct net *net,
2012                              const struct Qdisc_class_ops *cops,
2013                              struct sk_buff *oskb, struct nlmsghdr *n,
2014                              struct Qdisc *q, unsigned long cl,
2015                              struct netlink_ext_ack *extack)
2016 {
2017         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
2018         struct sk_buff *skb;
2019         int err = 0;
2020 
2021         if (!cops->delete)
2022                 return -EOPNOTSUPP;
2023 
2024         if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) {
2025                 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2026                 if (!skb)
2027                         return -ENOBUFS;
2028 
2029                 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
2030                                    RTM_DELTCLASS, extack) < 0) {
2031                         kfree_skb(skb);
2032                         return -EINVAL;
2033                 }
2034         } else {
2035                 skb = NULL;
2036         }
2037 
2038         err = cops->delete(q, cl, extack);
2039         if (err) {
2040                 kfree_skb(skb);
2041                 return err;
2042         }
2043 
2044         err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC,
2045                                    n->nlmsg_flags & NLM_F_ECHO);
2046         return err;
2047 }
2048 
2049 #ifdef CONFIG_NET_CLS
2050 
2051 struct tcf_bind_args {
2052         struct tcf_walker w;
2053         unsigned long base;
2054         unsigned long cl;
2055         u32 classid;
2056 };
2057 
2058 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
2059 {
2060         struct tcf_bind_args *a = (void *)arg;
2061 
2062         if (n && tp->ops->bind_class) {
2063                 struct Qdisc *q = tcf_block_q(tp->chain->block);
2064 
2065                 sch_tree_lock(q);
2066                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
2067                 sch_tree_unlock(q);
2068         }
2069         return 0;
2070 }
2071 
2072 struct tc_bind_class_args {
2073         struct qdisc_walker w;
2074         unsigned long new_cl;
2075         u32 portid;
2076         u32 clid;
2077 };
2078 
2079 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
2080                                 struct qdisc_walker *w)
2081 {
2082         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
2083         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2084         struct tcf_block *block;
2085         struct tcf_chain *chain;
2086 
2087         block = cops->tcf_block(q, cl, NULL);
2088         if (!block)
2089                 return 0;
2090         for (chain = tcf_get_next_chain(block, NULL);
2091              chain;
2092              chain = tcf_get_next_chain(block, chain)) {
2093                 struct tcf_proto *tp;
2094 
2095                 for (tp = tcf_get_next_proto(chain, NULL);
2096                      tp; tp = tcf_get_next_proto(chain, tp)) {
2097                         struct tcf_bind_args arg = {};
2098 
2099                         arg.w.fn = tcf_node_bind;
2100                         arg.classid = a->clid;
2101                         arg.base = cl;
2102                         arg.cl = a->new_cl;
2103                         tp->ops->walk(tp, &arg.w, true);
2104                 }
2105         }
2106 
2107         return 0;
2108 }
2109 
2110 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2111                            unsigned long new_cl)
2112 {
2113         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2114         struct tc_bind_class_args args = {};
2115 
2116         if (!cops->tcf_block)
2117                 return;
2118         args.portid = portid;
2119         args.clid = clid;
2120         args.new_cl = new_cl;
2121         args.w.fn = tc_bind_class_walker;
2122         q->ops->cl_ops->walk(q, &args.w);
2123 }
2124 
2125 #else
2126 
2127 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2128                            unsigned long new_cl)
2129 {
2130 }
2131 
2132 #endif
2133 
2134 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
2135                          struct netlink_ext_ack *extack)
2136 {
2137         struct net *net = sock_net(skb->sk);
2138         struct tcmsg *tcm = nlmsg_data(n);
2139         struct nlattr *tca[TCA_MAX + 1];
2140         struct net_device *dev;
2141         struct Qdisc *q = NULL;
2142         const struct Qdisc_class_ops *cops;
2143         unsigned long cl = 0;
2144         unsigned long new_cl;
2145         u32 portid;
2146         u32 clid;
2147         u32 qid;
2148         int err;
2149 
2150         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2151                                      rtm_tca_policy, extack);
2152         if (err < 0)
2153                 return err;
2154 
2155         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2156         if (!dev)
2157                 return -ENODEV;
2158 
2159         /*
2160            parent == TC_H_UNSPEC - unspecified parent.
2161            parent == TC_H_ROOT   - class is root, which has no parent.
2162            parent == X:0         - parent is root class.
2163            parent == X:Y         - parent is a node in hierarchy.
2164            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2165 
2166            handle == 0:0         - generate handle from kernel pool.
2167            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2168            handle == X:Y         - clear.
2169            handle == X:0         - root class.
2170          */
2171 
2172         /* Step 1. Determine qdisc handle X:0 */
2173 
2174         portid = tcm->tcm_parent;
2175         clid = tcm->tcm_handle;
2176         qid = TC_H_MAJ(clid);
2177 
2178         if (portid != TC_H_ROOT) {
2179                 u32 qid1 = TC_H_MAJ(portid);
2180 
2181                 if (qid && qid1) {
2182                         /* If both majors are known, they must be identical. */
2183                         if (qid != qid1)
2184                                 return -EINVAL;
2185                 } else if (qid1) {
2186                         qid = qid1;
2187                 } else if (qid == 0)
2188                         qid = rtnl_dereference(dev->qdisc)->handle;
2189 
2190                 /* Now qid is genuine qdisc handle consistent
2191                  * both with parent and child.
2192                  *
2193                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2194                  */
2195                 if (portid)
2196                         portid = TC_H_MAKE(qid, portid);
2197         } else {
2198                 if (qid == 0)
2199                         qid = rtnl_dereference(dev->qdisc)->handle;
2200         }
2201 
2202         /* OK. Locate qdisc */
2203         q = qdisc_lookup(dev, qid);
2204         if (!q)
2205                 return -ENOENT;
2206 
2207         /* An check that it supports classes */
2208         cops = q->ops->cl_ops;
2209         if (cops == NULL)
2210                 return -EINVAL;
2211 
2212         /* Now try to get class */
2213         if (clid == 0) {
2214                 if (portid == TC_H_ROOT)
2215                         clid = qid;
2216         } else
2217                 clid = TC_H_MAKE(qid, clid);
2218 
2219         if (clid)
2220                 cl = cops->find(q, clid);
2221 
2222         if (cl == 0) {
2223                 err = -ENOENT;
2224                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2225                     !(n->nlmsg_flags & NLM_F_CREATE))
2226                         goto out;
2227         } else {
2228                 switch (n->nlmsg_type) {
2229                 case RTM_NEWTCLASS:
2230                         err = -EEXIST;
2231                         if (n->nlmsg_flags & NLM_F_EXCL)
2232                                 goto out;
2233                         break;
2234                 case RTM_DELTCLASS:
2235                         err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2236                         /* Unbind the class with flilters with 0 */
2237                         tc_bind_tclass(q, portid, clid, 0);
2238                         goto out;
2239                 case RTM_GETTCLASS:
2240                         err = tclass_get_notify(net, skb, n, q, cl, extack);
2241                         goto out;
2242                 default:
2243                         err = -EINVAL;
2244                         goto out;
2245                 }
2246         }
2247 
2248         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2249                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2250                 return -EOPNOTSUPP;
2251         }
2252 
2253         new_cl = cl;
2254         err = -EOPNOTSUPP;
2255         if (cops->change)
2256                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2257         if (err == 0) {
2258                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
2259                 /* We just create a new class, need to do reverse binding. */
2260                 if (cl != new_cl)
2261                         tc_bind_tclass(q, portid, clid, new_cl);
2262         }
2263 out:
2264         return err;
2265 }
2266 
2267 struct qdisc_dump_args {
2268         struct qdisc_walker     w;
2269         struct sk_buff          *skb;
2270         struct netlink_callback *cb;
2271 };
2272 
2273 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2274                             struct qdisc_walker *arg)
2275 {
2276         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2277 
2278         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2279                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2280                               RTM_NEWTCLASS, NULL);
2281 }
2282 
2283 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2284                                 struct tcmsg *tcm, struct netlink_callback *cb,
2285                                 int *t_p, int s_t)
2286 {
2287         struct qdisc_dump_args arg;
2288 
2289         if (tc_qdisc_dump_ignore(q, false) ||
2290             *t_p < s_t || !q->ops->cl_ops ||
2291             (tcm->tcm_parent &&
2292              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2293                 (*t_p)++;
2294                 return 0;
2295         }
2296         if (*t_p > s_t)
2297                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2298         arg.w.fn = qdisc_class_dump;
2299         arg.skb = skb;
2300         arg.cb = cb;
2301         arg.w.stop  = 0;
2302         arg.w.skip = cb->args[1];
2303         arg.w.count = 0;
2304         q->ops->cl_ops->walk(q, &arg.w);
2305         cb->args[1] = arg.w.count;
2306         if (arg.w.stop)
2307                 return -1;
2308         (*t_p)++;
2309         return 0;
2310 }
2311 
2312 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2313                                struct tcmsg *tcm, struct netlink_callback *cb,
2314                                int *t_p, int s_t, bool recur)
2315 {
2316         struct Qdisc *q;
2317         int b;
2318 
2319         if (!root)
2320                 return 0;
2321 
2322         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2323                 return -1;
2324 
2325         if (!qdisc_dev(root) || !recur)
2326                 return 0;
2327 
2328         if (tcm->tcm_parent) {
2329                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2330                 if (q && q != root &&
2331                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2332                         return -1;
2333                 return 0;
2334         }
2335         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2336                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2337                         return -1;
2338         }
2339 
2340         return 0;
2341 }
2342 
2343 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2344 {
2345         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2346         struct net *net = sock_net(skb->sk);
2347         struct netdev_queue *dev_queue;
2348         struct net_device *dev;
2349         int t, s_t;
2350 
2351         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2352                 return 0;
2353         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2354         if (!dev)
2355                 return 0;
2356 
2357         s_t = cb->args[0];
2358         t = 0;
2359 
2360         if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2361                                 skb, tcm, cb, &t, s_t, true) < 0)
2362                 goto done;
2363 
2364         dev_queue = dev_ingress_queue(dev);
2365         if (dev_queue &&
2366             tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
2367                                 skb, tcm, cb, &t, s_t, false) < 0)
2368                 goto done;
2369 
2370 done:
2371         cb->args[0] = t;
2372 
2373         dev_put(dev);
2374         return skb->len;
2375 }
2376 
2377 #ifdef CONFIG_PROC_FS
2378 static int psched_show(struct seq_file *seq, void *v)
2379 {
2380         seq_printf(seq, "%08x %08x %08x %08x\n",
2381                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2382                    1000000,
2383                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2384 
2385         return 0;
2386 }
2387 
2388 static int __net_init psched_net_init(struct net *net)
2389 {
2390         struct proc_dir_entry *e;
2391 
2392         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2393         if (e == NULL)
2394                 return -ENOMEM;
2395 
2396         return 0;
2397 }
2398 
2399 static void __net_exit psched_net_exit(struct net *net)
2400 {
2401         remove_proc_entry("psched", net->proc_net);
2402 }
2403 #else
2404 static int __net_init psched_net_init(struct net *net)
2405 {
2406         return 0;
2407 }
2408 
2409 static void __net_exit psched_net_exit(struct net *net)
2410 {
2411 }
2412 #endif
2413 
2414 static struct pernet_operations psched_net_ops = {
2415         .init = psched_net_init,
2416         .exit = psched_net_exit,
2417 };
2418 
2419 #if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)
2420 DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
2421 #endif
2422 
2423 static int __init pktsched_init(void)
2424 {
2425         int err;
2426 
2427         err = register_pernet_subsys(&psched_net_ops);
2428         if (err) {
2429                 pr_err("pktsched_init: "
2430                        "cannot initialize per netns operations\n");
2431                 return err;
2432         }
2433 
2434         register_qdisc(&pfifo_fast_ops);
2435         register_qdisc(&pfifo_qdisc_ops);
2436         register_qdisc(&bfifo_qdisc_ops);
2437         register_qdisc(&pfifo_head_drop_qdisc_ops);
2438         register_qdisc(&mq_qdisc_ops);
2439         register_qdisc(&noqueue_qdisc_ops);
2440 
2441         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2442         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2443         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2444                       0);
2445         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2446         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2447         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2448                       0);
2449 
2450         tc_wrapper_init();
2451 
2452         return 0;
2453 }
2454 
2455 subsys_initcall(pktsched_init);
2456 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php