TOMOYO Linux Cross Reference
Linux/net/sched/sch_generic.c

  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * net/sched/sch_generic.c      Generic packet scheduler routines.
  4  *
  5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  6  *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
  7  *              - Ingress support
  8  */
  9 
 10 #include <linux/bitops.h>
 11 #include <linux/module.h>
 12 #include <linux/types.h>
 13 #include <linux/kernel.h>
 14 #include <linux/sched.h>
 15 #include <linux/string.h>
 16 #include <linux/errno.h>
 17 #include <linux/netdevice.h>
 18 #include <linux/skbuff.h>
 19 #include <linux/rtnetlink.h>
 20 #include <linux/init.h>
 21 #include <linux/rcupdate.h>
 22 #include <linux/list.h>
 23 #include <linux/slab.h>
 24 #include <linux/if_vlan.h>
 25 #include <linux/skb_array.h>
 26 #include <linux/if_macvlan.h>
 27 #include <net/sch_generic.h>
 28 #include <net/pkt_sched.h>
 29 #include <net/dst.h>
 30 #include <net/hotdata.h>
 31 #include <trace/events/qdisc.h>
 32 #include <trace/events/net.h>
 33 #include <net/xfrm.h>
 34 
 35 /* Qdisc to use by default */
 36 const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
 37 EXPORT_SYMBOL(default_qdisc_ops);
 38 
 39 static void qdisc_maybe_clear_missed(struct Qdisc *q,
 40                                      const struct netdev_queue *txq)
 41 {
 42         clear_bit(__QDISC_STATE_MISSED, &q->state);
 43 
 44         /* Make sure the below netif_xmit_frozen_or_stopped()
 45          * checking happens after clearing STATE_MISSED.
 46          */
 47         smp_mb__after_atomic();
 48 
 49         /* Check netif_xmit_frozen_or_stopped() again to make sure that
 50          * STATE_MISSED is set again in case the STATE_MISSED set by
 51          * netif_tx_wake_queue()'s rescheduling of net_tx_action() was
 52          * cleared by the clear_bit() above.
 53          */
 54         if (!netif_xmit_frozen_or_stopped(txq))
 55                 set_bit(__QDISC_STATE_MISSED, &q->state);
 56         else
 57                 set_bit(__QDISC_STATE_DRAINING, &q->state);
 58 }
 59 
 60 /* Main transmission queue. */
 61 
 62 /* Modifications to data participating in scheduling must be protected with
 63  * qdisc_lock(qdisc) spinlock.
 64  *
 65  * The idea is the following:
 66  * - enqueue, dequeue are serialized via qdisc root lock
 67  * - ingress filtering is also serialized via qdisc root lock
 68  * - updates to tree and tree walking are only done under the rtnl mutex.
 69  */
 70 
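/* Illustrative sketch, not part of the original file: reading scheduler state
 * under the root lock described above, for lock-based (non-TCQ_F_NOLOCK)
 * qdiscs. Only the includes already present in this file are assumed; the
 * helper name is hypothetical.
 */
static inline unsigned int example_read_qlen_locked(struct Qdisc *q)
{
        spinlock_t *root_lock = qdisc_lock(q);
        unsigned int qlen;

        spin_lock_bh(root_lock);        /* serializes with enqueue/dequeue */
        qlen = q->q.qlen;
        spin_unlock_bh(root_lock);

        return qlen;
}
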
 71 #define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)
 72 
 73 static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
 74 {
 75         const struct netdev_queue *txq = q->dev_queue;
 76         spinlock_t *lock = NULL;
 77         struct sk_buff *skb;
 78 
 79         if (q->flags & TCQ_F_NOLOCK) {
 80                 lock = qdisc_lock(q);
 81                 spin_lock(lock);
 82         }
 83 
 84         skb = skb_peek(&q->skb_bad_txq);
 85         if (skb) {
 86                 /* check the reason for requeuing without taking the tx lock first */
 87                 txq = skb_get_tx_queue(txq->dev, skb);
 88                 if (!netif_xmit_frozen_or_stopped(txq)) {
 89                         skb = __skb_dequeue(&q->skb_bad_txq);
 90                         if (qdisc_is_percpu_stats(q)) {
 91                                 qdisc_qstats_cpu_backlog_dec(q, skb);
 92                                 qdisc_qstats_cpu_qlen_dec(q);
 93                         } else {
 94                                 qdisc_qstats_backlog_dec(q, skb);
 95                                 q->q.qlen--;
 96                         }
 97                 } else {
 98                         skb = SKB_XOFF_MAGIC;
 99                         qdisc_maybe_clear_missed(q, txq);
100                 }
101         }
102 
103         if (lock)
104                 spin_unlock(lock);
105 
106         return skb;
107 }
108 
109 static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
110 {
111         struct sk_buff *skb = skb_peek(&q->skb_bad_txq);
112 
113         if (unlikely(skb))
114                 skb = __skb_dequeue_bad_txq(q);
115 
116         return skb;
117 }
118 
119 static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
120                                              struct sk_buff *skb)
121 {
122         spinlock_t *lock = NULL;
123 
124         if (q->flags & TCQ_F_NOLOCK) {
125                 lock = qdisc_lock(q);
126                 spin_lock(lock);
127         }
128 
129         __skb_queue_tail(&q->skb_bad_txq, skb);
130 
131         if (qdisc_is_percpu_stats(q)) {
132                 qdisc_qstats_cpu_backlog_inc(q, skb);
133                 qdisc_qstats_cpu_qlen_inc(q);
134         } else {
135                 qdisc_qstats_backlog_inc(q, skb);
136                 q->q.qlen++;
137         }
138 
139         if (lock)
140                 spin_unlock(lock);
141 }
142 
143 static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
144 {
145         spinlock_t *lock = NULL;
146 
147         if (q->flags & TCQ_F_NOLOCK) {
148                 lock = qdisc_lock(q);
149                 spin_lock(lock);
150         }
151 
152         while (skb) {
153                 struct sk_buff *next = skb->next;
154 
155                 __skb_queue_tail(&q->gso_skb, skb);
156 
157                 /* it's still part of the queue */
158                 if (qdisc_is_percpu_stats(q)) {
159                         qdisc_qstats_cpu_requeues_inc(q);
160                         qdisc_qstats_cpu_backlog_inc(q, skb);
161                         qdisc_qstats_cpu_qlen_inc(q);
162                 } else {
163                         q->qstats.requeues++;
164                         qdisc_qstats_backlog_inc(q, skb);
165                         q->q.qlen++;
166                 }
167 
168                 skb = next;
169         }
170 
171         if (lock) {
172                 spin_unlock(lock);
173                 set_bit(__QDISC_STATE_MISSED, &q->state);
174         } else {
175                 __netif_schedule(q);
176         }
177 }
178 
179 static void try_bulk_dequeue_skb(struct Qdisc *q,
180                                  struct sk_buff *skb,
181                                  const struct netdev_queue *txq,
182                                  int *packets)
183 {
184         int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;
185 
186         while (bytelimit > 0) {
187                 struct sk_buff *nskb = q->dequeue(q);
188 
189                 if (!nskb)
190                         break;
191 
192                 bytelimit -= nskb->len; /* covers GSO len */
193                 skb->next = nskb;
194                 skb = nskb;
195                 (*packets)++; /* GSO counts as one pkt */
196         }
197         skb_mark_not_on_list(skb);
198 }
199 
200 /* This variant of try_bulk_dequeue_skb() makes sure
201  * all skbs in the chain are for the same txq
202  */
203 static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
204                                       struct sk_buff *skb,
205                                       int *packets)
206 {
207         int mapping = skb_get_queue_mapping(skb);
208         struct sk_buff *nskb;
209         int cnt = 0;
210 
211         do {
212                 nskb = q->dequeue(q);
213                 if (!nskb)
214                         break;
215                 if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
216                         qdisc_enqueue_skb_bad_txq(q, nskb);
217                         break;
218                 }
219                 skb->next = nskb;
220                 skb = nskb;
221         } while (++cnt < 8);
222         (*packets) += cnt;
223         skb_mark_not_on_list(skb);
224 }
225 
226 /* Note that dequeue_skb can possibly return a SKB list (via skb->next).
227  * A requeued skb (via q->gso_skb) can also be a SKB list.
228  */
229 static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
230                                    int *packets)
231 {
232         const struct netdev_queue *txq = q->dev_queue;
233         struct sk_buff *skb = NULL;
234 
235         *packets = 1;
236         if (unlikely(!skb_queue_empty(&q->gso_skb))) {
237                 spinlock_t *lock = NULL;
238 
239                 if (q->flags & TCQ_F_NOLOCK) {
240                         lock = qdisc_lock(q);
241                         spin_lock(lock);
242                 }
243 
244                 skb = skb_peek(&q->gso_skb);
245 
246                 /* skb may be null if another cpu pulls gso_skb off in between
247                  * empty check and lock.
248                  */
249                 if (!skb) {
250                         if (lock)
251                                 spin_unlock(lock);
252                         goto validate;
253                 }
254 
255                 /* skb in gso_skb were already validated */
256                 *validate = false;
257                 if (xfrm_offload(skb))
258                         *validate = true;
 259                 /* check the reason for requeuing without taking the tx lock first */
260                 txq = skb_get_tx_queue(txq->dev, skb);
261                 if (!netif_xmit_frozen_or_stopped(txq)) {
262                         skb = __skb_dequeue(&q->gso_skb);
263                         if (qdisc_is_percpu_stats(q)) {
264                                 qdisc_qstats_cpu_backlog_dec(q, skb);
265                                 qdisc_qstats_cpu_qlen_dec(q);
266                         } else {
267                                 qdisc_qstats_backlog_dec(q, skb);
268                                 q->q.qlen--;
269                         }
270                 } else {
271                         skb = NULL;
272                         qdisc_maybe_clear_missed(q, txq);
273                 }
274                 if (lock)
275                         spin_unlock(lock);
276                 goto trace;
277         }
278 validate:
279         *validate = true;
280 
281         if ((q->flags & TCQ_F_ONETXQUEUE) &&
282             netif_xmit_frozen_or_stopped(txq)) {
283                 qdisc_maybe_clear_missed(q, txq);
284                 return skb;
285         }
286 
287         skb = qdisc_dequeue_skb_bad_txq(q);
288         if (unlikely(skb)) {
289                 if (skb == SKB_XOFF_MAGIC)
290                         return NULL;
291                 goto bulk;
292         }
293         skb = q->dequeue(q);
294         if (skb) {
295 bulk:
296                 if (qdisc_may_bulk(q))
297                         try_bulk_dequeue_skb(q, skb, txq, packets);
298                 else
299                         try_bulk_dequeue_skb_slow(q, skb, packets);
300         }
301 trace:
302         trace_qdisc_dequeue(q, txq, *packets, skb);
303         return skb;
304 }
305 
306 /*
307  * Transmit possibly several skbs, and handle the return status as
308  * required. Owning qdisc running bit guarantees that only one CPU
309  * can execute this function.
310  *
311  * Returns to the caller:
 312  *                              false  - hardware queue frozen, caller should back off
313  *                              true   - feel free to send more pkts
314  */
315 bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
316                      struct net_device *dev, struct netdev_queue *txq,
317                      spinlock_t *root_lock, bool validate)
318 {
319         int ret = NETDEV_TX_BUSY;
320         bool again = false;
321 
322         /* And release qdisc */
323         if (root_lock)
324                 spin_unlock(root_lock);
325 
326         /* Note that we validate skb (GSO, checksum, ...) outside of locks */
327         if (validate)
328                 skb = validate_xmit_skb_list(skb, dev, &again);
329 
330 #ifdef CONFIG_XFRM_OFFLOAD
331         if (unlikely(again)) {
332                 if (root_lock)
333                         spin_lock(root_lock);
334 
335                 dev_requeue_skb(skb, q);
336                 return false;
337         }
338 #endif
339 
340         if (likely(skb)) {
341                 HARD_TX_LOCK(dev, txq, smp_processor_id());
342                 if (!netif_xmit_frozen_or_stopped(txq))
343                         skb = dev_hard_start_xmit(skb, dev, txq, &ret);
344                 else
345                         qdisc_maybe_clear_missed(q, txq);
346 
347                 HARD_TX_UNLOCK(dev, txq);
348         } else {
349                 if (root_lock)
350                         spin_lock(root_lock);
351                 return true;
352         }
353 
354         if (root_lock)
355                 spin_lock(root_lock);
356 
357         if (!dev_xmit_complete(ret)) {
358                 /* Driver returned NETDEV_TX_BUSY - requeue skb */
359                 if (unlikely(ret != NETDEV_TX_BUSY))
360                         net_warn_ratelimited("BUG %s code %d qlen %d\n",
361                                              dev->name, ret, q->q.qlen);
362 
363                 dev_requeue_skb(skb, q);
364                 return false;
365         }
366 
367         return true;
368 }
369 
370 /*
371  * NOTE: Called under qdisc_lock(q) with locally disabled BH.
372  *
 373  * The qdisc "running" state, taken via qdisc_run_begin(), guarantees
 374  * only one CPU can process this qdisc at a time. qdisc_lock(q)
 375  * serializes queue accesses for this queue.
376  *
377  *  netif_tx_lock serializes accesses to device driver.
378  *
379  *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
380  *  if one is grabbed, another must be free.
381  *
 382  * Note that this procedure can be called by a watchdog timer
 383  *
 384  * Returns to the caller:
 385  *                              false - queue is empty or throttled.
 386  *                              true  - queue is not empty.
387  *
388  */
389 static inline bool qdisc_restart(struct Qdisc *q, int *packets)
390 {
391         spinlock_t *root_lock = NULL;
392         struct netdev_queue *txq;
393         struct net_device *dev;
394         struct sk_buff *skb;
395         bool validate;
396 
397         /* Dequeue packet */
398         skb = dequeue_skb(q, &validate, packets);
399         if (unlikely(!skb))
400                 return false;
401 
402         if (!(q->flags & TCQ_F_NOLOCK))
403                 root_lock = qdisc_lock(q);
404 
405         dev = qdisc_dev(q);
406         txq = skb_get_tx_queue(dev, skb);
407 
408         return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
409 }
410 
411 void __qdisc_run(struct Qdisc *q)
412 {
413         int quota = READ_ONCE(net_hotdata.dev_tx_weight);
414         int packets;
415 
416         while (qdisc_restart(q, &packets)) {
417                 quota -= packets;
418                 if (quota <= 0) {
419                         if (q->flags & TCQ_F_NOLOCK)
420                                 set_bit(__QDISC_STATE_MISSED, &q->state);
421                         else
422                                 __netif_schedule(q);
423 
424                         break;
425                 }
426         }
427 }
428 
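/* Illustrative sketch, not part of the original file: the usual caller pattern
 * around __qdisc_run(), mirroring qdisc_run() from <net/pkt_sched.h>: run the
 * qdisc only after winning the "running" state, and always pair with
 * qdisc_run_end(). The wrapper name is hypothetical.
 */
static inline void example_run_qdisc(struct Qdisc *q)
{
        if (qdisc_run_begin(q)) {
                __qdisc_run(q);
                qdisc_run_end(q);
        }
}
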
429 unsigned long dev_trans_start(struct net_device *dev)
430 {
431         unsigned long res = READ_ONCE(netdev_get_tx_queue(dev, 0)->trans_start);
432         unsigned long val;
433         unsigned int i;
434 
435         for (i = 1; i < dev->num_tx_queues; i++) {
436                 val = READ_ONCE(netdev_get_tx_queue(dev, i)->trans_start);
437                 if (val && time_after(val, res))
438                         res = val;
439         }
440 
441         return res;
442 }
443 EXPORT_SYMBOL(dev_trans_start);
444 
445 static void netif_freeze_queues(struct net_device *dev)
446 {
447         unsigned int i;
448         int cpu;
449 
450         cpu = smp_processor_id();
451         for (i = 0; i < dev->num_tx_queues; i++) {
452                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
453 
454                 /* We are the only thread of execution doing a
455                  * freeze, but we have to grab the _xmit_lock in
456                  * order to synchronize with threads which are in
457                  * the ->hard_start_xmit() handler and already
458                  * checked the frozen bit.
459                  */
460                 __netif_tx_lock(txq, cpu);
461                 set_bit(__QUEUE_STATE_FROZEN, &txq->state);
462                 __netif_tx_unlock(txq);
463         }
464 }
465 
466 void netif_tx_lock(struct net_device *dev)
467 {
468         spin_lock(&dev->tx_global_lock);
469         netif_freeze_queues(dev);
470 }
471 EXPORT_SYMBOL(netif_tx_lock);
472 
473 static void netif_unfreeze_queues(struct net_device *dev)
474 {
475         unsigned int i;
476 
477         for (i = 0; i < dev->num_tx_queues; i++) {
478                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
479 
480                 /* No need to grab the _xmit_lock here.  If the
481                  * queue is not stopped for another reason, we
482                  * force a schedule.
483                  */
484                 clear_bit(__QUEUE_STATE_FROZEN, &txq->state);
485                 netif_schedule_queue(txq);
486         }
487 }
488 
489 void netif_tx_unlock(struct net_device *dev)
490 {
491         netif_unfreeze_queues(dev);
492         spin_unlock(&dev->tx_global_lock);
493 }
494 EXPORT_SYMBOL(netif_tx_unlock);
495 
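/* Illustrative sketch, not part of the original file: a driver-side pattern
 * that relies on the freeze protocol above. All TX queues are quiesced while
 * shared transmit state is touched; the reconfiguration step is a
 * hypothetical placeholder.
 */
static inline void example_quiesce_tx(struct net_device *dev)
{
        netif_tx_lock_bh(dev);          /* freezes every TX queue */
        /* ... reconfigure shared TX resources here ... */
        netif_tx_unlock_bh(dev);        /* unfreezes and reschedules queues */
}
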
496 static void dev_watchdog(struct timer_list *t)
497 {
498         struct net_device *dev = from_timer(dev, t, watchdog_timer);
499         bool release = true;
500 
501         spin_lock(&dev->tx_global_lock);
502         if (!qdisc_tx_is_noop(dev)) {
503                 if (netif_device_present(dev) &&
504                     netif_running(dev) &&
505                     netif_carrier_ok(dev)) {
506                         unsigned int timedout_ms = 0;
507                         unsigned int i;
508                         unsigned long trans_start;
509                         unsigned long oldest_start = jiffies;
510 
511                         for (i = 0; i < dev->num_tx_queues; i++) {
512                                 struct netdev_queue *txq;
513 
514                                 txq = netdev_get_tx_queue(dev, i);
515                                 trans_start = READ_ONCE(txq->trans_start);
516                                 if (!netif_xmit_stopped(txq))
517                                         continue;
518                                 if (time_after(jiffies, trans_start + dev->watchdog_timeo)) {
519                                         timedout_ms = jiffies_to_msecs(jiffies - trans_start);
520                                         atomic_long_inc(&txq->trans_timeout);
521                                         break;
522                                 }
523                                 if (time_after(oldest_start, trans_start))
524                                         oldest_start = trans_start;
525                         }
526 
527                         if (unlikely(timedout_ms)) {
528                                 trace_net_dev_xmit_timeout(dev, i);
529                                 netdev_crit(dev, "NETDEV WATCHDOG: CPU: %d: transmit queue %u timed out %u ms\n",
530                                             raw_smp_processor_id(),
531                                             i, timedout_ms);
532                                 netif_freeze_queues(dev);
533                                 dev->netdev_ops->ndo_tx_timeout(dev, i);
534                                 netif_unfreeze_queues(dev);
535                         }
536                         if (!mod_timer(&dev->watchdog_timer,
537                                        round_jiffies(oldest_start +
538                                                      dev->watchdog_timeo)))
539                                 release = false;
540                 }
541         }
542         spin_unlock(&dev->tx_global_lock);
543 
544         if (release)
545                 netdev_put(dev, &dev->watchdog_dev_tracker);
546 }
547 
548 void __netdev_watchdog_up(struct net_device *dev)
549 {
550         if (dev->netdev_ops->ndo_tx_timeout) {
551                 if (dev->watchdog_timeo <= 0)
552                         dev->watchdog_timeo = 5*HZ;
553                 if (!mod_timer(&dev->watchdog_timer,
554                                round_jiffies(jiffies + dev->watchdog_timeo)))
555                         netdev_hold(dev, &dev->watchdog_dev_tracker,
556                                     GFP_ATOMIC);
557         }
558 }
559 EXPORT_SYMBOL_GPL(__netdev_watchdog_up);
560 
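/* Illustrative sketch, not part of the original file: the driver-side contract
 * for the watchdog above. The handler, ops and setup names are hypothetical;
 * only the ndo_tx_timeout hook, watchdog_timeo and the 5*HZ default come from
 * the surrounding code.
 */
static void example_tx_timeout(struct net_device *dev, unsigned int txqueue)
{
        /* Runs between netif_freeze_queues() and netif_unfreeze_queues(). */
        netdev_warn(dev, "TX queue %u timed out\n", txqueue);
}

static const struct net_device_ops example_netdev_ops __maybe_unused = {
        .ndo_tx_timeout = example_tx_timeout,
};

static void __maybe_unused example_watchdog_setup(struct net_device *dev)
{
        dev->netdev_ops = &example_netdev_ops;
        dev->watchdog_timeo = 5 * HZ;   /* 0 also defaults to 5*HZ above */
}
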
561 static void dev_watchdog_up(struct net_device *dev)
562 {
563         __netdev_watchdog_up(dev);
564 }
565 
566 static void dev_watchdog_down(struct net_device *dev)
567 {
568         netif_tx_lock_bh(dev);
569         if (del_timer(&dev->watchdog_timer))
570                 netdev_put(dev, &dev->watchdog_dev_tracker);
571         netif_tx_unlock_bh(dev);
572 }
573 
574 /**
575  *      netif_carrier_on - set carrier
576  *      @dev: network device
577  *
578  * Device has detected acquisition of carrier.
579  */
580 void netif_carrier_on(struct net_device *dev)
581 {
582         if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
583                 if (dev->reg_state == NETREG_UNINITIALIZED)
584                         return;
585                 atomic_inc(&dev->carrier_up_count);
586                 linkwatch_fire_event(dev);
587                 if (netif_running(dev))
588                         __netdev_watchdog_up(dev);
589         }
590 }
591 EXPORT_SYMBOL(netif_carrier_on);
592 
593 /**
594  *      netif_carrier_off - clear carrier
595  *      @dev: network device
596  *
597  * Device has detected loss of carrier.
598  */
599 void netif_carrier_off(struct net_device *dev)
600 {
601         if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
602                 if (dev->reg_state == NETREG_UNINITIALIZED)
603                         return;
604                 atomic_inc(&dev->carrier_down_count);
605                 linkwatch_fire_event(dev);
606         }
607 }
608 EXPORT_SYMBOL(netif_carrier_off);
609 
610 /**
611  *      netif_carrier_event - report carrier state event
612  *      @dev: network device
613  *
614  * Device has detected a carrier event but the carrier state wasn't changed.
615  * Use in drivers when querying carrier state asynchronously, to avoid missing
616  * events (link flaps) if link recovers before it's queried.
617  */
618 void netif_carrier_event(struct net_device *dev)
619 {
620         if (dev->reg_state == NETREG_UNINITIALIZED)
621                 return;
622         atomic_inc(&dev->carrier_up_count);
623         atomic_inc(&dev->carrier_down_count);
624         linkwatch_fire_event(dev);
625 }
626 EXPORT_SYMBOL_GPL(netif_carrier_event);
627 
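/* Illustrative sketch, not part of the original file: how a driver typically
 * reports link changes with the helpers above, e.g. from a link interrupt or
 * polled PHY state; the helper name and link_up flag are hypothetical.
 */
static inline void example_report_link(struct net_device *dev, bool link_up)
{
        if (link_up)
                netif_carrier_on(dev);
        else
                netif_carrier_off(dev);
}
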
628 /* "NOOP" scheduler: the best scheduler, recommended for all interfaces
629    under all circumstances. It is difficult to invent anything faster or
630    cheaper.
631  */
632 
633 static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
634                         struct sk_buff **to_free)
635 {
636         dev_core_stats_tx_dropped_inc(skb->dev);
637         __qdisc_drop(skb, to_free);
638         return NET_XMIT_CN;
639 }
640 
641 static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
642 {
643         return NULL;
644 }
645 
646 struct Qdisc_ops noop_qdisc_ops __read_mostly = {
647         .id             =       "noop",
648         .priv_size      =       0,
649         .enqueue        =       noop_enqueue,
650         .dequeue        =       noop_dequeue,
651         .peek           =       noop_dequeue,
652         .owner          =       THIS_MODULE,
653 };
654 
655 static struct netdev_queue noop_netdev_queue = {
656         RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc),
657         RCU_POINTER_INITIALIZER(qdisc_sleeping, &noop_qdisc),
658 };
659 
660 struct Qdisc noop_qdisc = {
661         .enqueue        =       noop_enqueue,
662         .dequeue        =       noop_dequeue,
663         .flags          =       TCQ_F_BUILTIN,
664         .ops            =       &noop_qdisc_ops,
665         .q.lock         =       __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
666         .dev_queue      =       &noop_netdev_queue,
667         .busylock       =       __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
668         .gso_skb = {
669                 .next = (struct sk_buff *)&noop_qdisc.gso_skb,
670                 .prev = (struct sk_buff *)&noop_qdisc.gso_skb,
671                 .qlen = 0,
672                 .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock),
673         },
674         .skb_bad_txq = {
675                 .next = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
676                 .prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
677                 .qlen = 0,
678                 .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
679         },
680         .owner = -1,
681 };
682 EXPORT_SYMBOL(noop_qdisc);
683 
684 static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
685                         struct netlink_ext_ack *extack)
686 {
687         /* register_qdisc() assigns a default of noop_enqueue if unset,
688          * but __dev_queue_xmit() treats noqueue only as such
689          * if this is NULL - so clear it here. */
690         qdisc->enqueue = NULL;
691         return 0;
692 }
693 
694 struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
695         .id             =       "noqueue",
696         .priv_size      =       0,
697         .init           =       noqueue_init,
698         .enqueue        =       noop_enqueue,
699         .dequeue        =       noop_dequeue,
700         .peek           =       noop_dequeue,
701         .owner          =       THIS_MODULE,
702 };
703 
704 const u8 sch_default_prio2band[TC_PRIO_MAX + 1] = {
705         1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
706 };
707 EXPORT_SYMBOL(sch_default_prio2band);
708 
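/* Worked example, not part of the original file: pfifo_fast_enqueue() below
 * indexes this table with skb->priority & TC_PRIO_MAX, so e.g.
 * TC_PRIO_INTERACTIVE (6) maps to band 0 (dequeued first) while
 * TC_PRIO_BESTEFFORT (0) maps to band 1. The helper name is hypothetical.
 */
static inline int example_prio2band(u32 priority)
{
        return sch_default_prio2band[priority & TC_PRIO_MAX];
}
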
 709 /* 3-band FIFO queue: old style, but should be a bit faster than the
 710    generic prio+fifo combination.
711  */
712 
713 #define PFIFO_FAST_BANDS 3
714 
715 /*
716  * Private data for a pfifo_fast scheduler containing:
717  *      - rings for priority bands
718  */
719 struct pfifo_fast_priv {
720         struct skb_array q[PFIFO_FAST_BANDS];
721 };
722 
723 static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
724                                           int band)
725 {
726         return &priv->q[band];
727 }
728 
729 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
730                               struct sk_buff **to_free)
731 {
732         int band = sch_default_prio2band[skb->priority & TC_PRIO_MAX];
733         struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
734         struct skb_array *q = band2list(priv, band);
735         unsigned int pkt_len = qdisc_pkt_len(skb);
736         int err;
737 
738         err = skb_array_produce(q, skb);
739 
740         if (unlikely(err)) {
741                 if (qdisc_is_percpu_stats(qdisc))
742                         return qdisc_drop_cpu(skb, qdisc, to_free);
743                 else
744                         return qdisc_drop(skb, qdisc, to_free);
745         }
746 
747         qdisc_update_stats_at_enqueue(qdisc, pkt_len);
748         return NET_XMIT_SUCCESS;
749 }
750 
751 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
752 {
753         struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
754         struct sk_buff *skb = NULL;
755         bool need_retry = true;
756         int band;
757 
758 retry:
759         for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
760                 struct skb_array *q = band2list(priv, band);
761 
762                 if (__skb_array_empty(q))
763                         continue;
764 
765                 skb = __skb_array_consume(q);
766         }
767         if (likely(skb)) {
768                 qdisc_update_stats_at_dequeue(qdisc, skb);
769         } else if (need_retry &&
770                    READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) {
771                 /* Delay clearing the STATE_MISSED here to reduce
772                  * the overhead of the second spin_trylock() in
773                  * qdisc_run_begin() and __netif_schedule() calling
774                  * in qdisc_run_end().
775                  */
776                 clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
777                 clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
778 
779                 /* Make sure dequeuing happens after clearing
780                  * STATE_MISSED.
781                  */
782                 smp_mb__after_atomic();
783 
784                 need_retry = false;
785 
786                 goto retry;
787         }
788 
789         return skb;
790 }
791 
792 static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
793 {
794         struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
795         struct sk_buff *skb = NULL;
796         int band;
797 
798         for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
799                 struct skb_array *q = band2list(priv, band);
800 
801                 skb = __skb_array_peek(q);
802         }
803 
804         return skb;
805 }
806 
807 static void pfifo_fast_reset(struct Qdisc *qdisc)
808 {
809         int i, band;
810         struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
811 
812         for (band = 0; band < PFIFO_FAST_BANDS; band++) {
813                 struct skb_array *q = band2list(priv, band);
814                 struct sk_buff *skb;
815 
 816                 /* The ring can be NULL if the destroy path runs because
 817                  * skb_array_init() failed in pfifo_fast_init().
 818                  */
819                 if (!q->ring.queue)
820                         continue;
821 
822                 while ((skb = __skb_array_consume(q)) != NULL)
823                         kfree_skb(skb);
824         }
825 
826         if (qdisc_is_percpu_stats(qdisc)) {
827                 for_each_possible_cpu(i) {
828                         struct gnet_stats_queue *q;
829 
830                         q = per_cpu_ptr(qdisc->cpu_qstats, i);
831                         q->backlog = 0;
832                         q->qlen = 0;
833                 }
834         }
835 }
836 
837 static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
838 {
839         struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
840 
841         memcpy(&opt.priomap, sch_default_prio2band, TC_PRIO_MAX + 1);
842         if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
843                 goto nla_put_failure;
844         return skb->len;
845 
846 nla_put_failure:
847         return -1;
848 }
849 
850 static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt,
851                            struct netlink_ext_ack *extack)
852 {
853         unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
854         struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
855         int prio;
856 
857         /* guard against zero length rings */
858         if (!qlen)
859                 return -EINVAL;
860 
861         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
862                 struct skb_array *q = band2list(priv, prio);
863                 int err;
864 
865                 err = skb_array_init(q, qlen, GFP_KERNEL);
866                 if (err)
867                         return -ENOMEM;
868         }
869 
870         /* Can by-pass the queue discipline */
871         qdisc->flags |= TCQ_F_CAN_BYPASS;
872         return 0;
873 }
874 
875 static void pfifo_fast_destroy(struct Qdisc *sch)
876 {
877         struct pfifo_fast_priv *priv = qdisc_priv(sch);
878         int prio;
879 
880         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
881                 struct skb_array *q = band2list(priv, prio);
882 
 883                 /* The ring can be NULL if the destroy path runs because
 884                  * skb_array_init() failed in pfifo_fast_init().
 885                  */
886                 if (!q->ring.queue)
887                         continue;
888                 /* Destroy ring but no need to kfree_skb because a call to
889                  * pfifo_fast_reset() has already done that work.
890                  */
891                 ptr_ring_cleanup(&q->ring, NULL);
892         }
893 }
894 
895 static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch,
896                                           unsigned int new_len)
897 {
898         struct pfifo_fast_priv *priv = qdisc_priv(sch);
899         struct skb_array *bands[PFIFO_FAST_BANDS];
900         int prio;
901 
902         for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
903                 struct skb_array *q = band2list(priv, prio);
904 
905                 bands[prio] = q;
906         }
907 
908         return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len,
909                                          GFP_KERNEL);
910 }
911 
912 struct Qdisc_ops pfifo_fast_ops __read_mostly = {
913         .id             =       "pfifo_fast",
914         .priv_size      =       sizeof(struct pfifo_fast_priv),
915         .enqueue        =       pfifo_fast_enqueue,
916         .dequeue        =       pfifo_fast_dequeue,
917         .peek           =       pfifo_fast_peek,
918         .init           =       pfifo_fast_init,
919         .destroy        =       pfifo_fast_destroy,
920         .reset          =       pfifo_fast_reset,
921         .dump           =       pfifo_fast_dump,
922         .change_tx_queue_len =  pfifo_fast_change_tx_queue_len,
923         .owner          =       THIS_MODULE,
924         .static_flags   =       TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
925 };
926 EXPORT_SYMBOL(pfifo_fast_ops);
927 
928 static struct lock_class_key qdisc_tx_busylock;
929 
930 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
931                           const struct Qdisc_ops *ops,
932                           struct netlink_ext_ack *extack)
933 {
934         struct Qdisc *sch;
935         unsigned int size = sizeof(*sch) + ops->priv_size;
936         int err = -ENOBUFS;
937         struct net_device *dev;
938 
939         if (!dev_queue) {
940                 NL_SET_ERR_MSG(extack, "No device queue given");
941                 err = -EINVAL;
942                 goto errout;
943         }
944 
945         dev = dev_queue->dev;
946         sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue));
947 
948         if (!sch)
949                 goto errout;
950         __skb_queue_head_init(&sch->gso_skb);
951         __skb_queue_head_init(&sch->skb_bad_txq);
952         gnet_stats_basic_sync_init(&sch->bstats);
953         lockdep_register_key(&sch->root_lock_key);
954         spin_lock_init(&sch->q.lock);
955         lockdep_set_class(&sch->q.lock, &sch->root_lock_key);
956 
957         if (ops->static_flags & TCQ_F_CPUSTATS) {
958                 sch->cpu_bstats =
959                         netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
960                 if (!sch->cpu_bstats)
961                         goto errout1;
962 
963                 sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
964                 if (!sch->cpu_qstats) {
965                         free_percpu(sch->cpu_bstats);
966                         goto errout1;
967                 }
968         }
969 
970         spin_lock_init(&sch->busylock);
971         lockdep_set_class(&sch->busylock,
972                           dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
973 
 974         /* seqlock has the same scope as busylock, for NOLOCK qdiscs */
975         spin_lock_init(&sch->seqlock);
976         lockdep_set_class(&sch->seqlock,
977                           dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
978 
979         sch->ops = ops;
980         sch->flags = ops->static_flags;
981         sch->enqueue = ops->enqueue;
982         sch->dequeue = ops->dequeue;
983         sch->dev_queue = dev_queue;
984         sch->owner = -1;
985         netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL);
986         refcount_set(&sch->refcnt, 1);
987 
988         return sch;
989 errout1:
990         lockdep_unregister_key(&sch->root_lock_key);
991         kfree(sch);
992 errout:
993         return ERR_PTR(err);
994 }
995 
996 struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
997                                 const struct Qdisc_ops *ops,
998                                 unsigned int parentid,
999                                 struct netlink_ext_ack *extack)
1000 {
1001         struct Qdisc *sch;
1002 
1003         if (!try_module_get(ops->owner)) {
1004                 NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
1005                 return NULL;
1006         }
1007 
1008         sch = qdisc_alloc(dev_queue, ops, extack);
1009         if (IS_ERR(sch)) {
1010                 module_put(ops->owner);
1011                 return NULL;
1012         }
1013         sch->parent = parentid;
1014 
1015         if (!ops->init || ops->init(sch, NULL, extack) == 0) {
1016                 trace_qdisc_create(ops, dev_queue->dev, parentid);
1017                 return sch;
1018         }
1019 
1020         qdisc_put(sch);
1021         return NULL;
1022 }
1023 EXPORT_SYMBOL(qdisc_create_dflt);
1024 
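/* Illustrative sketch, not part of the original file: how a classful qdisc
 * might use qdisc_create_dflt() to attach a default per-queue child. The
 * minor-numbering scheme and the helper name are assumptions shown only for
 * illustration.
 */
static inline struct Qdisc *example_default_child(struct Qdisc *sch,
                                                  struct netdev_queue *dev_queue,
                                                  unsigned int i,
                                                  struct netlink_ext_ack *extack)
{
        return qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
                                 TC_H_MAKE(TC_H_MAJ(sch->handle),
                                           TC_H_MIN(i + 1)), extack);
}
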
1025 /* Under qdisc_lock(qdisc) and BH! */
1026 
1027 void qdisc_reset(struct Qdisc *qdisc)
1028 {
1029         const struct Qdisc_ops *ops = qdisc->ops;
1030 
1031         trace_qdisc_reset(qdisc);
1032 
1033         if (ops->reset)
1034                 ops->reset(qdisc);
1035 
1036         __skb_queue_purge(&qdisc->gso_skb);
1037         __skb_queue_purge(&qdisc->skb_bad_txq);
1038 
1039         qdisc->q.qlen = 0;
1040         qdisc->qstats.backlog = 0;
1041 }
1042 EXPORT_SYMBOL(qdisc_reset);
1043 
1044 void qdisc_free(struct Qdisc *qdisc)
1045 {
1046         if (qdisc_is_percpu_stats(qdisc)) {
1047                 free_percpu(qdisc->cpu_bstats);
1048                 free_percpu(qdisc->cpu_qstats);
1049         }
1050 
1051         kfree(qdisc);
1052 }
1053 
1054 static void qdisc_free_cb(struct rcu_head *head)
1055 {
1056         struct Qdisc *q = container_of(head, struct Qdisc, rcu);
1057 
1058         qdisc_free(q);
1059 }
1060 
1061 static void __qdisc_destroy(struct Qdisc *qdisc)
1062 {
1063         const struct Qdisc_ops  *ops = qdisc->ops;
1064         struct net_device *dev = qdisc_dev(qdisc);
1065 
1066 #ifdef CONFIG_NET_SCHED
1067         qdisc_hash_del(qdisc);
1068 
1069         qdisc_put_stab(rtnl_dereference(qdisc->stab));
1070 #endif
1071         gen_kill_estimator(&qdisc->rate_est);
1072 
1073         qdisc_reset(qdisc);
1074 
1075 
1076         if (ops->destroy)
1077                 ops->destroy(qdisc);
1078 
1079         lockdep_unregister_key(&qdisc->root_lock_key);
1080         module_put(ops->owner);
1081         netdev_put(dev, &qdisc->dev_tracker);
1082 
1083         trace_qdisc_destroy(qdisc);
1084 
1085         call_rcu(&qdisc->rcu, qdisc_free_cb);
1086 }
1087 
1088 void qdisc_destroy(struct Qdisc *qdisc)
1089 {
1090         if (qdisc->flags & TCQ_F_BUILTIN)
1091                 return;
1092 
1093         __qdisc_destroy(qdisc);
1094 }
1095 
1096 void qdisc_put(struct Qdisc *qdisc)
1097 {
1098         if (!qdisc)
1099                 return;
1100 
1101         if (qdisc->flags & TCQ_F_BUILTIN ||
1102             !refcount_dec_and_test(&qdisc->refcnt))
1103                 return;
1104 
1105         __qdisc_destroy(qdisc);
1106 }
1107 EXPORT_SYMBOL(qdisc_put);
1108 
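/* Illustrative sketch, not part of the original file: the typical ownership
 * pattern for the refcounting above. A parent that created a child qdisc
 * drops it again with qdisc_put() under RTNL, e.g. from its ->destroy()
 * callback; the helper name is hypothetical.
 */
static inline void example_drop_child(struct Qdisc **pchild)
{
        qdisc_put(*pchild);     /* NULL-safe, see above */
        *pchild = NULL;
}
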
1109 /* Version of qdisc_put() that is called with the rtnl mutex unlocked.
1110  * Intended as an optimization: this function only takes the rtnl lock if
1111  * the qdisc reference counter reaches zero.
1112  */
1113 
1114 void qdisc_put_unlocked(struct Qdisc *qdisc)
1115 {
1116         if (qdisc->flags & TCQ_F_BUILTIN ||
1117             !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
1118                 return;
1119 
1120         __qdisc_destroy(qdisc);
1121         rtnl_unlock();
1122 }
1123 EXPORT_SYMBOL(qdisc_put_unlocked);
1124 
1125 /* Attach toplevel qdisc to device queue. */
1126 struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
1127                               struct Qdisc *qdisc)
1128 {
1129         struct Qdisc *oqdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
1130         spinlock_t *root_lock;
1131 
1132         root_lock = qdisc_lock(oqdisc);
1133         spin_lock_bh(root_lock);
1134 
1135         /* ... and graft new one */
1136         if (qdisc == NULL)
1137                 qdisc = &noop_qdisc;
1138         rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
1139         rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
1140 
1141         spin_unlock_bh(root_lock);
1142 
1143         return oqdisc;
1144 }
1145 EXPORT_SYMBOL(dev_graft_qdisc);
1146 
1147 static void shutdown_scheduler_queue(struct net_device *dev,
1148                                      struct netdev_queue *dev_queue,
1149                                      void *_qdisc_default)
1150 {
1151         struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
1152         struct Qdisc *qdisc_default = _qdisc_default;
1153 
1154         if (qdisc) {
1155                 rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
1156                 rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc_default);
1157 
1158                 qdisc_put(qdisc);
1159         }
1160 }
1161 
1162 static void attach_one_default_qdisc(struct net_device *dev,
1163                                      struct netdev_queue *dev_queue,
1164                                      void *_unused)
1165 {
1166         struct Qdisc *qdisc;
1167         const struct Qdisc_ops *ops = default_qdisc_ops;
1168 
1169         if (dev->priv_flags & IFF_NO_QUEUE)
1170                 ops = &noqueue_qdisc_ops;
1171         else if (dev->type == ARPHRD_CAN)
1172                 ops = &pfifo_fast_ops;
1173 
1174         qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
1175         if (!qdisc)
1176                 return;
1177 
1178         if (!netif_is_multiqueue(dev))
1179                 qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
1180         rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
1181 }
1182 
1183 static void attach_default_qdiscs(struct net_device *dev)
1184 {
1185         struct netdev_queue *txq;
1186         struct Qdisc *qdisc;
1187 
1188         txq = netdev_get_tx_queue(dev, 0);
1189 
1190         if (!netif_is_multiqueue(dev) ||
1191             dev->priv_flags & IFF_NO_QUEUE) {
1192                 netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
1193                 qdisc = rtnl_dereference(txq->qdisc_sleeping);
1194                 rcu_assign_pointer(dev->qdisc, qdisc);
1195                 qdisc_refcount_inc(qdisc);
1196         } else {
1197                 qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
1198                 if (qdisc) {
1199                         rcu_assign_pointer(dev->qdisc, qdisc);
1200                         qdisc->ops->attach(qdisc);
1201                 }
1202         }
1203         qdisc = rtnl_dereference(dev->qdisc);
1204 
1205         /* Detect default qdisc setup/init failed and fallback to "noqueue" */
1206         if (qdisc == &noop_qdisc) {
1207                 netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n",
1208                             default_qdisc_ops->id, noqueue_qdisc_ops.id);
1209                 netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
1210                 dev->priv_flags |= IFF_NO_QUEUE;
1211                 netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
1212                 qdisc = rtnl_dereference(txq->qdisc_sleeping);
1213                 rcu_assign_pointer(dev->qdisc, qdisc);
1214                 qdisc_refcount_inc(qdisc);
1215                 dev->priv_flags ^= IFF_NO_QUEUE;
1216         }
1217 
1218 #ifdef CONFIG_NET_SCHED
1219         if (qdisc != &noop_qdisc)
1220                 qdisc_hash_add(qdisc, false);
1221 #endif
1222 }
1223 
1224 static void transition_one_qdisc(struct net_device *dev,
1225                                  struct netdev_queue *dev_queue,
1226                                  void *_need_watchdog)
1227 {
1228         struct Qdisc *new_qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
1229         int *need_watchdog_p = _need_watchdog;
1230 
1231         if (!(new_qdisc->flags & TCQ_F_BUILTIN))
1232                 clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
1233 
1234         rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
1235         if (need_watchdog_p) {
1236                 WRITE_ONCE(dev_queue->trans_start, 0);
1237                 *need_watchdog_p = 1;
1238         }
1239 }
1240 
1241 void dev_activate(struct net_device *dev)
1242 {
1243         int need_watchdog;
1244 
1245         /* If no queueing discipline is attached to the device, create a
1246          * default one for devices that need queueing, and noqueue_qdisc
1247          * for virtual interfaces.
1248          */
1249 
1250         if (rtnl_dereference(dev->qdisc) == &noop_qdisc)
1251                 attach_default_qdiscs(dev);
1252 
1253         if (!netif_carrier_ok(dev))
1254                 /* Delay activation until next carrier-on event */
1255                 return;
1256 
1257         need_watchdog = 0;
1258         netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
1259         if (dev_ingress_queue(dev))
1260                 transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
1261 
1262         if (need_watchdog) {
1263                 netif_trans_update(dev);
1264                 dev_watchdog_up(dev);
1265         }
1266 }
1267 EXPORT_SYMBOL(dev_activate);
1268 
1269 static void qdisc_deactivate(struct Qdisc *qdisc)
1270 {
1271         if (qdisc->flags & TCQ_F_BUILTIN)
1272                 return;
1273 
1274         set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
1275 }
1276 
1277 static void dev_deactivate_queue(struct net_device *dev,
1278                                  struct netdev_queue *dev_queue,
1279                                  void *_qdisc_default)
1280 {
1281         struct Qdisc *qdisc_default = _qdisc_default;
1282         struct Qdisc *qdisc;
1283 
1284         qdisc = rtnl_dereference(dev_queue->qdisc);
1285         if (qdisc) {
1286                 qdisc_deactivate(qdisc);
1287                 rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
1288         }
1289 }
1290 
1291 static void dev_reset_queue(struct net_device *dev,
1292                             struct netdev_queue *dev_queue,
1293                             void *_unused)
1294 {
1295         struct Qdisc *qdisc;
1296         bool nolock;
1297 
1298         qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
1299         if (!qdisc)
1300                 return;
1301 
1302         nolock = qdisc->flags & TCQ_F_NOLOCK;
1303 
1304         if (nolock)
1305                 spin_lock_bh(&qdisc->seqlock);
1306         spin_lock_bh(qdisc_lock(qdisc));
1307 
1308         qdisc_reset(qdisc);
1309 
1310         spin_unlock_bh(qdisc_lock(qdisc));
1311         if (nolock) {
1312                 clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
1313                 clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
1314                 spin_unlock_bh(&qdisc->seqlock);
1315         }
1316 }
1317 
1318 static bool some_qdisc_is_busy(struct net_device *dev)
1319 {
1320         unsigned int i;
1321 
1322         for (i = 0; i < dev->num_tx_queues; i++) {
1323                 struct netdev_queue *dev_queue;
1324                 spinlock_t *root_lock;
1325                 struct Qdisc *q;
1326                 int val;
1327 
1328                 dev_queue = netdev_get_tx_queue(dev, i);
1329                 q = rtnl_dereference(dev_queue->qdisc_sleeping);
1330 
1331                 root_lock = qdisc_lock(q);
1332                 spin_lock_bh(root_lock);
1333 
1334                 val = (qdisc_is_running(q) ||
1335                        test_bit(__QDISC_STATE_SCHED, &q->state));
1336 
1337                 spin_unlock_bh(root_lock);
1338 
1339                 if (val)
1340                         return true;
1341         }
1342         return false;
1343 }
1344 
1345 /**
1346  *      dev_deactivate_many - deactivate transmissions on several devices
1347  *      @head: list of devices to deactivate
1348  *
1349  *      This function returns only when all outstanding transmissions
1350  *      have completed, unless all devices are in dismantle phase.
1351  */
1352 void dev_deactivate_many(struct list_head *head)
1353 {
1354         struct net_device *dev;
1355 
1356         list_for_each_entry(dev, head, close_list) {
1357                 netdev_for_each_tx_queue(dev, dev_deactivate_queue,
1358                                          &noop_qdisc);
1359                 if (dev_ingress_queue(dev))
1360                         dev_deactivate_queue(dev, dev_ingress_queue(dev),
1361                                              &noop_qdisc);
1362 
1363                 dev_watchdog_down(dev);
1364         }
1365 
1366         /* Wait for outstanding qdisc-less dev_queue_xmit calls or
1367          * outstanding qdisc enqueuing calls.
1368          * This is avoided if all devices are in dismantle phase:
1369          * Caller will call synchronize_net() for us
1370          */
1371         synchronize_net();
1372 
1373         list_for_each_entry(dev, head, close_list) {
1374                 netdev_for_each_tx_queue(dev, dev_reset_queue, NULL);
1375 
1376                 if (dev_ingress_queue(dev))
1377                         dev_reset_queue(dev, dev_ingress_queue(dev), NULL);
1378         }
1379 
1380         /* Wait for outstanding qdisc_run calls. */
1381         list_for_each_entry(dev, head, close_list) {
1382                 while (some_qdisc_is_busy(dev)) {
1383                         /* wait_event() would avoid this sleep-loop but would
1384                          * require expensive checks in the fast paths of packet
1385                          * processing which isn't worth it.
1386                          */
1387                         schedule_timeout_uninterruptible(1);
1388                 }
1389         }
1390 }
1391 
1392 void dev_deactivate(struct net_device *dev)
1393 {
1394         LIST_HEAD(single);
1395 
1396         list_add(&dev->close_list, &single);
1397         dev_deactivate_many(&single);
1398         list_del(&single);
1399 }
1400 EXPORT_SYMBOL(dev_deactivate);
1401 
1402 static int qdisc_change_tx_queue_len(struct net_device *dev,
1403                                      struct netdev_queue *dev_queue)
1404 {
1405         struct Qdisc *qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
1406         const struct Qdisc_ops *ops = qdisc->ops;
1407 
1408         if (ops->change_tx_queue_len)
1409                 return ops->change_tx_queue_len(qdisc, dev->tx_queue_len);
1410         return 0;
1411 }
1412 
1413 void dev_qdisc_change_real_num_tx(struct net_device *dev,
1414                                   unsigned int new_real_tx)
1415 {
1416         struct Qdisc *qdisc = rtnl_dereference(dev->qdisc);
1417 
1418         if (qdisc->ops->change_real_num_tx)
1419                 qdisc->ops->change_real_num_tx(qdisc, new_real_tx);
1420 }
1421 
1422 void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx)
1423 {
1424 #ifdef CONFIG_NET_SCHED
1425         struct net_device *dev = qdisc_dev(sch);
1426         struct Qdisc *qdisc;
1427         unsigned int i;
1428 
1429         for (i = new_real_tx; i < dev->real_num_tx_queues; i++) {
1430                 qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping);
1431                 /* Only update the default qdiscs we created;
1432                  * qdiscs with handles are always hashed.
1433                  */
1434                 if (qdisc != &noop_qdisc && !qdisc->handle)
1435                         qdisc_hash_del(qdisc);
1436         }
1437         for (i = dev->real_num_tx_queues; i < new_real_tx; i++) {
1438                 qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc_sleeping);
1439                 if (qdisc != &noop_qdisc && !qdisc->handle)
1440                         qdisc_hash_add(qdisc, false);
1441         }
1442 #endif
1443 }
1444 EXPORT_SYMBOL(mq_change_real_num_tx);
1445 
1446 int dev_qdisc_change_tx_queue_len(struct net_device *dev)
1447 {
1448         bool up = dev->flags & IFF_UP;
1449         unsigned int i;
1450         int ret = 0;
1451 
1452         if (up)
1453                 dev_deactivate(dev);
1454 
1455         for (i = 0; i < dev->num_tx_queues; i++) {
1456                 ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]);
1457 
1458                 /* TODO: revert changes on a partial failure */
1459                 if (ret)
1460                         break;
1461         }
1462 
1463         if (up)
1464                 dev_activate(dev);
1465         return ret;
1466 }
1467 
1468 static void dev_init_scheduler_queue(struct net_device *dev,
1469                                      struct netdev_queue *dev_queue,
1470                                      void *_qdisc)
1471 {
1472         struct Qdisc *qdisc = _qdisc;
1473 
1474         rcu_assign_pointer(dev_queue->qdisc, qdisc);
1475         rcu_assign_pointer(dev_queue->qdisc_sleeping, qdisc);
1476 }
1477 
1478 void dev_init_scheduler(struct net_device *dev)
1479 {
1480         rcu_assign_pointer(dev->qdisc, &noop_qdisc);
1481         netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
1482         if (dev_ingress_queue(dev))
1483                 dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
1484 
1485         timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
1486 }
1487 
1488 void dev_shutdown(struct net_device *dev)
1489 {
1490         netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
1491         if (dev_ingress_queue(dev))
1492                 shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
1493         qdisc_put(rtnl_dereference(dev->qdisc));
1494         rcu_assign_pointer(dev->qdisc, &noop_qdisc);
1495 
1496         WARN_ON(timer_pending(&dev->watchdog_timer));
1497 }
1498 
1499 /**
1500  * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division
1501  * @rate:   Rate to compute reciprocal division values of
1502  * @mult:   Multiplier for reciprocal division
1503  * @shift:  Shift for reciprocal division
1504  *
1505  * The multiplier and shift for reciprocal division by rate are stored
1506  * in mult and shift.
1507  *
1508  * The deal here is to replace a divide by a reciprocal one
1509  * in fast path (a reciprocal divide is a multiply and a shift)
1510  *
1511  * Normal formula would be :
1512  *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
1513  *
1514  * We compute mult/shift to use instead :
1515  *  time_in_ns = (len * mult) >> shift;
1516  *
1517  * We try to get the highest possible mult value for accuracy,
1518  * but have to make sure no overflows will ever happen.
1519  *
1520  * reciprocal_value() is not used here because it doesn't handle 64-bit values.
1521  */
1522 static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift)
1523 {
1524         u64 factor = NSEC_PER_SEC;
1525 
1526         *mult = 1;
1527         *shift = 0;
1528 
1529         if (rate <= 0)
1530                 return;
1531 
1532         for (;;) {
1533                 *mult = div64_u64(factor, rate);
1534                 if (*mult & (1U << 31) || factor & (1ULL << 63))
1535                         break;
1536                 factor <<= 1;
1537                 (*shift)++;
1538         }
1539 }
1540 
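/* Worked example, not part of the original file: for a 1 Gbit/s rate,
 * rate_bytes_ps = 125000000 and the loop above converges to mult = 1U << 31
 * with shift = 28, so (len * mult) >> shift == len * 8, i.e. 8 ns per byte,
 * matching NSEC_PER_SEC / 125000000. The helper below shows the fast-path
 * use of mult/shift, roughly what psched_l2t_ns() does; its name is
 * hypothetical.
 */
static inline u64 example_len_to_ns(unsigned int len, u32 mult, u8 shift)
{
        return ((u64)len * mult) >> shift;
}
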
1541 void psched_ratecfg_precompute(struct psched_ratecfg *r,
1542                                const struct tc_ratespec *conf,
1543                                u64 rate64)
1544 {
1545         memset(r, 0, sizeof(*r));
1546         r->overhead = conf->overhead;
1547         r->mpu = conf->mpu;
1548         r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
1549         r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
1550         psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift);
1551 }
1552 EXPORT_SYMBOL(psched_ratecfg_precompute);
1553 
1554 void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64)
1555 {
1556         r->rate_pkts_ps = pktrate64;
1557         psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift);
1558 }
1559 EXPORT_SYMBOL(psched_ppscfg_precompute);
1560 
1561 void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
1562                           struct tcf_proto *tp_head)
1563 {
1564         /* Protected with chain0->filter_chain_lock.
1565          * Can't access chain directly because tp_head can be NULL.
1566          */
1567         struct mini_Qdisc *miniq_old =
1568                 rcu_dereference_protected(*miniqp->p_miniq, 1);
1569         struct mini_Qdisc *miniq;
1570 
1571         if (!tp_head) {
1572                 RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
1573         } else {
1574                 miniq = miniq_old != &miniqp->miniq1 ?
1575                         &miniqp->miniq1 : &miniqp->miniq2;
1576 
1577                 /* We need to make sure that readers won't see the miniq
1578                  * we are about to modify. So ensure that at least one RCU
1579                  * grace period has elapsed since the miniq was made
1580                  * inactive.
1581                  */
1582                 if (IS_ENABLED(CONFIG_PREEMPT_RT))
1583                         cond_synchronize_rcu(miniq->rcu_state);
1584                 else if (!poll_state_synchronize_rcu(miniq->rcu_state))
1585                         synchronize_rcu_expedited();
1586 
1587                 miniq->filter_list = tp_head;
1588                 rcu_assign_pointer(*miniqp->p_miniq, miniq);
1589         }
1590 
1591         if (miniq_old)
1592                 /* This is the counterpart of the rcu sync above. We need
1593                  * to block potential new users of miniq_old until all
1594                  * readers have stopped seeing it.
1595                  */
1596                 miniq_old->rcu_state = start_poll_synchronize_rcu();
1597 }
1598 EXPORT_SYMBOL(mini_qdisc_pair_swap);
1599 
1600 void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
1601                                 struct tcf_block *block)
1602 {
1603         miniqp->miniq1.block = block;
1604         miniqp->miniq2.block = block;
1605 }
1606 EXPORT_SYMBOL(mini_qdisc_pair_block_init);
1607 
1608 void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
1609                           struct mini_Qdisc __rcu **p_miniq)
1610 {
1611         miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
1612         miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
1613         miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
1614         miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
1615         miniqp->miniq1.rcu_state = get_state_synchronize_rcu();
1616         miniqp->miniq2.rcu_state = miniqp->miniq1.rcu_state;
1617         miniqp->p_miniq = p_miniq;
1618 }
1619 EXPORT_SYMBOL(mini_qdisc_pair_init);
1620 
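/* Illustrative sketch, not part of the original file: the intended call
 * sequence for the mini_Qdisc pair helpers above, similar in shape to what
 * clsact-style users do. The helper name and caller-owned pointer are
 * hypothetical, and the swap is normally done under the chain0
 * filter_chain_lock as noted above.
 */
static inline void example_miniq_attach(struct mini_Qdisc_pair *pair,
                                        struct Qdisc *sch,
                                        struct mini_Qdisc __rcu **p_miniq,
                                        struct tcf_proto *filters)
{
        mini_qdisc_pair_init(pair, sch, p_miniq);
        mini_qdisc_pair_swap(pair, filters);    /* publish the filter list */
}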
