~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/net/core/dev.c

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  *      NET3    Protocol independent device support routines.
  4  *
  5  *      Derived from the non IP parts of dev.c 1.0.19
  6  *              Authors:        Ross Biro
  7  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  8  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  9  *
 10  *      Additional Authors:
 11  *              Florian la Roche <rzsfl@rz.uni-sb.de>
 12  *              Alan Cox <gw4pts@gw4pts.ampr.org>
 13  *              David Hinds <dahinds@users.sourceforge.net>
 14  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 15  *              Adam Sulmicki <adam@cfar.umd.edu>
 16  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 17  *
 18  *      Changes:
 19  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 20  *                                      to 2 if register_netdev gets called
 21  *                                      before net_dev_init & also removed a
 22  *                                      few lines of code in the process.
 23  *              Alan Cox        :       device private ioctl copies fields back.
 24  *              Alan Cox        :       Transmit queue code does relevant
 25  *                                      stunts to keep the queue safe.
 26  *              Alan Cox        :       Fixed double lock.
 27  *              Alan Cox        :       Fixed promisc NULL pointer trap
 28  *              ????????        :       Support the full private ioctl range
 29  *              Alan Cox        :       Moved ioctl permission check into
 30  *                                      drivers
 31  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
 32  *              Alan Cox        :       100 backlog just doesn't cut it when
 33  *                                      you start doing multicast video 8)
 34  *              Alan Cox        :       Rewrote net_bh and list manager.
 35  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 36  *              Alan Cox        :       Took out transmit every packet pass
 37  *                                      Saved a few bytes in the ioctl handler
 38  *              Alan Cox        :       Network driver sets packet type before
 39  *                                      calling netif_rx. Saves a function
 40  *                                      call a packet.
 41  *              Alan Cox        :       Hashed net_bh()
 42  *              Richard Kooijman:       Timestamp fixes.
 43  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
 44  *              Alan Cox        :       Device lock protection.
 45  *              Alan Cox        :       Fixed nasty side effect of device close
 46  *                                      changes.
 47  *              Rudi Cilibrasi  :       Pass the right thing to
 48  *                                      set_mac_address()
 49  *              Dave Miller     :       32bit quantity for the device lock to
 50  *                                      make it work out on a Sparc.
 51  *              Bjorn Ekwall    :       Added KERNELD hack.
 52  *              Alan Cox        :       Cleaned up the backlog initialise.
 53  *              Craig Metz      :       SIOCGIFCONF fix if space for under
 54  *                                      1 device.
 55  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
 56  *                                      is no device open function.
 57  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
 58  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
 59  *              Cyrus Durgin    :       Cleaned for KMOD
 60  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
 61  *                                      A network device unload needs to purge
 62  *                                      the backlog queue.
 63  *      Paul Rusty Russell      :       SIOCSIFNAME
 64  *              Pekka Riikonen  :       Netdev boot-time settings code
 65  *              Andrew Morton   :       Make unregister_netdevice wait
 66  *                                      indefinitely on dev->refcnt
 67  *              J Hadi Salim    :       - Backlog queue sampling
 68  *                                      - netif_rx() feedback
 69  */
 70 
 71 #include <linux/uaccess.h>
 72 #include <linux/bitmap.h>
 73 #include <linux/capability.h>
 74 #include <linux/cpu.h>
 75 #include <linux/types.h>
 76 #include <linux/kernel.h>
 77 #include <linux/hash.h>
 78 #include <linux/slab.h>
 79 #include <linux/sched.h>
 80 #include <linux/sched/isolation.h>
 81 #include <linux/sched/mm.h>
 82 #include <linux/smpboot.h>
 83 #include <linux/mutex.h>
 84 #include <linux/rwsem.h>
 85 #include <linux/string.h>
 86 #include <linux/mm.h>
 87 #include <linux/socket.h>
 88 #include <linux/sockios.h>
 89 #include <linux/errno.h>
 90 #include <linux/interrupt.h>
 91 #include <linux/if_ether.h>
 92 #include <linux/netdevice.h>
 93 #include <linux/etherdevice.h>
 94 #include <linux/ethtool.h>
 95 #include <linux/skbuff.h>
 96 #include <linux/kthread.h>
 97 #include <linux/bpf.h>
 98 #include <linux/bpf_trace.h>
 99 #include <net/net_namespace.h>
100 #include <net/sock.h>
101 #include <net/busy_poll.h>
102 #include <linux/rtnetlink.h>
103 #include <linux/stat.h>
104 #include <net/dsa.h>
105 #include <net/dst.h>
106 #include <net/dst_metadata.h>
107 #include <net/gro.h>
108 #include <net/pkt_sched.h>
109 #include <net/pkt_cls.h>
110 #include <net/checksum.h>
111 #include <net/xfrm.h>
112 #include <net/tcx.h>
113 #include <linux/highmem.h>
114 #include <linux/init.h>
115 #include <linux/module.h>
116 #include <linux/netpoll.h>
117 #include <linux/rcupdate.h>
118 #include <linux/delay.h>
119 #include <net/iw_handler.h>
120 #include <asm/current.h>
121 #include <linux/audit.h>
122 #include <linux/dmaengine.h>
123 #include <linux/err.h>
124 #include <linux/ctype.h>
125 #include <linux/if_arp.h>
126 #include <linux/if_vlan.h>
127 #include <linux/ip.h>
128 #include <net/ip.h>
129 #include <net/mpls.h>
130 #include <linux/ipv6.h>
131 #include <linux/in.h>
132 #include <linux/jhash.h>
133 #include <linux/random.h>
134 #include <trace/events/napi.h>
135 #include <trace/events/net.h>
136 #include <trace/events/skb.h>
137 #include <trace/events/qdisc.h>
138 #include <trace/events/xdp.h>
139 #include <linux/inetdevice.h>
140 #include <linux/cpu_rmap.h>
141 #include <linux/static_key.h>
142 #include <linux/hashtable.h>
143 #include <linux/vmalloc.h>
144 #include <linux/if_macvlan.h>
145 #include <linux/errqueue.h>
146 #include <linux/hrtimer.h>
147 #include <linux/netfilter_netdev.h>
148 #include <linux/crash_dump.h>
149 #include <linux/sctp.h>
150 #include <net/udp_tunnel.h>
151 #include <linux/net_namespace.h>
152 #include <linux/indirect_call_wrapper.h>
153 #include <net/devlink.h>
154 #include <linux/pm_runtime.h>
155 #include <linux/prandom.h>
156 #include <linux/once_lite.h>
157 #include <net/netdev_rx_queue.h>
158 #include <net/page_pool/types.h>
159 #include <net/page_pool/helpers.h>
160 #include <net/rps.h>
161 
162 #include "dev.h"
163 #include "net-sysfs.h"
164 
165 static DEFINE_SPINLOCK(ptype_lock);
166 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
167 
168 static int netif_rx_internal(struct sk_buff *skb);
169 static int call_netdevice_notifiers_extack(unsigned long val,
170                                            struct net_device *dev,
171                                            struct netlink_ext_ack *extack);
172 
173 static DEFINE_MUTEX(ifalias_mutex);
174 
175 /* protects napi_hash addition/deletion and napi_gen_id */
176 static DEFINE_SPINLOCK(napi_hash_lock);
177 
178 static unsigned int napi_gen_id = NR_CPUS;
179 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
180 
181 static DECLARE_RWSEM(devnet_rename_sem);
182 
183 static inline void dev_base_seq_inc(struct net *net)
184 {
185         unsigned int val = net->dev_base_seq + 1;
186 
187         WRITE_ONCE(net->dev_base_seq, val ?: 1);
188 }
189 
190 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
191 {
192         unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
193 
194         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
195 }
196 
197 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
198 {
199         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
200 }
201 
202 #ifndef CONFIG_PREEMPT_RT
203 
204 static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
205 
206 static int __init setup_backlog_napi_threads(char *arg)
207 {
208         static_branch_enable(&use_backlog_threads_key);
209         return 0;
210 }
211 early_param("thread_backlog_napi", setup_backlog_napi_threads);
212 
213 static bool use_backlog_threads(void)
214 {
215         return static_branch_unlikely(&use_backlog_threads_key);
216 }
217 
218 #else
219 
220 static bool use_backlog_threads(void)
221 {
222         return true;
223 }
224 
225 #endif
226 
227 static inline void backlog_lock_irq_save(struct softnet_data *sd,
228                                          unsigned long *flags)
229 {
230         if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
231                 spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
232         else
233                 local_irq_save(*flags);
234 }
235 
236 static inline void backlog_lock_irq_disable(struct softnet_data *sd)
237 {
238         if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
239                 spin_lock_irq(&sd->input_pkt_queue.lock);
240         else
241                 local_irq_disable();
242 }
243 
244 static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
245                                               unsigned long *flags)
246 {
247         if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
248                 spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
249         else
250                 local_irq_restore(*flags);
251 }
252 
253 static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
254 {
255         if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
256                 spin_unlock_irq(&sd->input_pkt_queue.lock);
257         else
258                 local_irq_enable();
259 }
260 
261 static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
262                                                        const char *name)
263 {
264         struct netdev_name_node *name_node;
265 
266         name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
267         if (!name_node)
268                 return NULL;
269         INIT_HLIST_NODE(&name_node->hlist);
270         name_node->dev = dev;
271         name_node->name = name;
272         return name_node;
273 }
274 
275 static struct netdev_name_node *
276 netdev_name_node_head_alloc(struct net_device *dev)
277 {
278         struct netdev_name_node *name_node;
279 
280         name_node = netdev_name_node_alloc(dev, dev->name);
281         if (!name_node)
282                 return NULL;
283         INIT_LIST_HEAD(&name_node->list);
284         return name_node;
285 }
286 
287 static void netdev_name_node_free(struct netdev_name_node *name_node)
288 {
289         kfree(name_node);
290 }
291 
292 static void netdev_name_node_add(struct net *net,
293                                  struct netdev_name_node *name_node)
294 {
295         hlist_add_head_rcu(&name_node->hlist,
296                            dev_name_hash(net, name_node->name));
297 }
298 
299 static void netdev_name_node_del(struct netdev_name_node *name_node)
300 {
301         hlist_del_rcu(&name_node->hlist);
302 }
303 
304 static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
305                                                         const char *name)
306 {
307         struct hlist_head *head = dev_name_hash(net, name);
308         struct netdev_name_node *name_node;
309 
310         hlist_for_each_entry(name_node, head, hlist)
311                 if (!strcmp(name_node->name, name))
312                         return name_node;
313         return NULL;
314 }
315 
316 static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
317                                                             const char *name)
318 {
319         struct hlist_head *head = dev_name_hash(net, name);
320         struct netdev_name_node *name_node;
321 
322         hlist_for_each_entry_rcu(name_node, head, hlist)
323                 if (!strcmp(name_node->name, name))
324                         return name_node;
325         return NULL;
326 }
327 
328 bool netdev_name_in_use(struct net *net, const char *name)
329 {
330         return netdev_name_node_lookup(net, name);
331 }
332 EXPORT_SYMBOL(netdev_name_in_use);
333 
334 int netdev_name_node_alt_create(struct net_device *dev, const char *name)
335 {
336         struct netdev_name_node *name_node;
337         struct net *net = dev_net(dev);
338 
339         name_node = netdev_name_node_lookup(net, name);
340         if (name_node)
341                 return -EEXIST;
342         name_node = netdev_name_node_alloc(dev, name);
343         if (!name_node)
344                 return -ENOMEM;
345         netdev_name_node_add(net, name_node);
346         /* The node that holds dev->name acts as a head of per-device list. */
347         list_add_tail_rcu(&name_node->list, &dev->name_node->list);
348 
349         return 0;
350 }
351 
352 static void netdev_name_node_alt_free(struct rcu_head *head)
353 {
354         struct netdev_name_node *name_node =
355                 container_of(head, struct netdev_name_node, rcu);
356 
357         kfree(name_node->name);
358         netdev_name_node_free(name_node);
359 }
360 
361 static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
362 {
363         netdev_name_node_del(name_node);
364         list_del(&name_node->list);
365         call_rcu(&name_node->rcu, netdev_name_node_alt_free);
366 }
367 
368 int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
369 {
370         struct netdev_name_node *name_node;
371         struct net *net = dev_net(dev);
372 
373         name_node = netdev_name_node_lookup(net, name);
374         if (!name_node)
375                 return -ENOENT;
376         /* lookup might have found our primary name or a name belonging
377          * to another device.
378          */
379         if (name_node == dev->name_node || name_node->dev != dev)
380                 return -EINVAL;
381 
382         __netdev_name_node_alt_destroy(name_node);
383         return 0;
384 }
385 
386 static void netdev_name_node_alt_flush(struct net_device *dev)
387 {
388         struct netdev_name_node *name_node, *tmp;
389 
390         list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
391                 list_del(&name_node->list);
392                 netdev_name_node_alt_free(&name_node->rcu);
393         }
394 }
395 
396 /* Device list insertion */
397 static void list_netdevice(struct net_device *dev)
398 {
399         struct netdev_name_node *name_node;
400         struct net *net = dev_net(dev);
401 
402         ASSERT_RTNL();
403 
404         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
405         netdev_name_node_add(net, dev->name_node);
406         hlist_add_head_rcu(&dev->index_hlist,
407                            dev_index_hash(net, dev->ifindex));
408 
409         netdev_for_each_altname(dev, name_node)
410                 netdev_name_node_add(net, name_node);
411 
412         /* We reserved the ifindex, this can't fail */
413         WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
414 
415         dev_base_seq_inc(net);
416 }
417 
418 /* Device list removal
419  * caller must respect a RCU grace period before freeing/reusing dev
420  */
421 static void unlist_netdevice(struct net_device *dev)
422 {
423         struct netdev_name_node *name_node;
424         struct net *net = dev_net(dev);
425 
426         ASSERT_RTNL();
427 
428         xa_erase(&net->dev_by_index, dev->ifindex);
429 
430         netdev_for_each_altname(dev, name_node)
431                 netdev_name_node_del(name_node);
432 
433         /* Unlink dev from the device chain */
434         list_del_rcu(&dev->dev_list);
435         netdev_name_node_del(dev->name_node);
436         hlist_del_rcu(&dev->index_hlist);
437 
438         dev_base_seq_inc(dev_net(dev));
439 }
440 
441 /*
442  *      Our notifier list
443  */
444 
445 static RAW_NOTIFIER_HEAD(netdev_chain);
446 
447 /*
448  *      Device drivers call our routines to queue packets here. We empty the
449  *      queue in the local softnet handler.
450  */
451 
452 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
453         .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
454 };
455 EXPORT_PER_CPU_SYMBOL(softnet_data);
456 
457 /* Page_pool has a lockless array/stack to alloc/recycle pages.
458  * PP consumers must pay attention to run APIs in the appropriate context
459  * (e.g. NAPI context).
460  */
461 static DEFINE_PER_CPU(struct page_pool *, system_page_pool);
462 
463 #ifdef CONFIG_LOCKDEP
464 /*
465  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
466  * according to dev->type
467  */
468 static const unsigned short netdev_lock_type[] = {
469          ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
470          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
471          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
472          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
473          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
474          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
475          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
476          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
477          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
478          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
479          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
480          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
481          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
482          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
483          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
484 
485 static const char *const netdev_lock_name[] = {
486         "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
487         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
488         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
489         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
490         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
491         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
492         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
493         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
494         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
495         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
496         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
497         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
498         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
499         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
500         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
501 
502 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
503 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
504 
505 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
506 {
507         int i;
508 
509         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
510                 if (netdev_lock_type[i] == dev_type)
511                         return i;
512         /* the last key is used by default */
513         return ARRAY_SIZE(netdev_lock_type) - 1;
514 }
515 
516 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
517                                                  unsigned short dev_type)
518 {
519         int i;
520 
521         i = netdev_lock_pos(dev_type);
522         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
523                                    netdev_lock_name[i]);
524 }
525 
526 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
527 {
528         int i;
529 
530         i = netdev_lock_pos(dev->type);
531         lockdep_set_class_and_name(&dev->addr_list_lock,
532                                    &netdev_addr_lock_key[i],
533                                    netdev_lock_name[i]);
534 }
535 #else
536 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
537                                                  unsigned short dev_type)
538 {
539 }
540 
541 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
542 {
543 }
544 #endif
545 
546 /*******************************************************************************
547  *
548  *              Protocol management and registration routines
549  *
550  *******************************************************************************/
551 
552 
553 /*
554  *      Add a protocol ID to the list. Now that the input handler is
555  *      smarter we can dispense with all the messy stuff that used to be
556  *      here.
557  *
558  *      BEWARE!!! Protocol handlers, mangling input packets,
559  *      MUST BE last in hash buckets and checking protocol handlers
560  *      MUST start from promiscuous ptype_all chain in net_bh.
561  *      It is true now, do not change it.
562  *      Explanation follows: if protocol handler, mangling packet, will
563  *      be the first on list, it is not able to sense, that packet
564  *      is cloned and should be copied-on-write, so that it will
565  *      change it and subsequent readers will get broken packet.
566  *                                                      --ANK (980803)
567  */
568 
569 static inline struct list_head *ptype_head(const struct packet_type *pt)
570 {
571         if (pt->type == htons(ETH_P_ALL))
572                 return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
573         else
574                 return pt->dev ? &pt->dev->ptype_specific :
575                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
576 }
577 
578 /**
579  *      dev_add_pack - add packet handler
580  *      @pt: packet type declaration
581  *
582  *      Add a protocol handler to the networking stack. The passed &packet_type
583  *      is linked into kernel lists and may not be freed until it has been
584  *      removed from the kernel lists.
585  *
586  *      This call does not sleep therefore it can not
587  *      guarantee all CPU's that are in middle of receiving packets
588  *      will see the new packet type (until the next received packet).
589  */
590 
591 void dev_add_pack(struct packet_type *pt)
592 {
593         struct list_head *head = ptype_head(pt);
594 
595         spin_lock(&ptype_lock);
596         list_add_rcu(&pt->list, head);
597         spin_unlock(&ptype_lock);
598 }
599 EXPORT_SYMBOL(dev_add_pack);
600 
601 /**
602  *      __dev_remove_pack        - remove packet handler
603  *      @pt: packet type declaration
604  *
605  *      Remove a protocol handler that was previously added to the kernel
606  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
607  *      from the kernel lists and can be freed or reused once this function
608  *      returns.
609  *
610  *      The packet type might still be in use by receivers
611  *      and must not be freed until after all the CPU's have gone
612  *      through a quiescent state.
613  */
614 void __dev_remove_pack(struct packet_type *pt)
615 {
616         struct list_head *head = ptype_head(pt);
617         struct packet_type *pt1;
618 
619         spin_lock(&ptype_lock);
620 
621         list_for_each_entry(pt1, head, list) {
622                 if (pt == pt1) {
623                         list_del_rcu(&pt->list);
624                         goto out;
625                 }
626         }
627 
628         pr_warn("dev_remove_pack: %p not found\n", pt);
629 out:
630         spin_unlock(&ptype_lock);
631 }
632 EXPORT_SYMBOL(__dev_remove_pack);
633 
634 /**
635  *      dev_remove_pack  - remove packet handler
636  *      @pt: packet type declaration
637  *
638  *      Remove a protocol handler that was previously added to the kernel
639  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
640  *      from the kernel lists and can be freed or reused once this function
641  *      returns.
642  *
643  *      This call sleeps to guarantee that no CPU is looking at the packet
644  *      type after return.
645  */
646 void dev_remove_pack(struct packet_type *pt)
647 {
648         __dev_remove_pack(pt);
649 
650         synchronize_net();
651 }
652 EXPORT_SYMBOL(dev_remove_pack);
653 
654 
655 /*******************************************************************************
656  *
657  *                          Device Interface Subroutines
658  *
659  *******************************************************************************/
660 
661 /**
662  *      dev_get_iflink  - get 'iflink' value of a interface
663  *      @dev: targeted interface
664  *
665  *      Indicates the ifindex the interface is linked to.
666  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
667  */
668 
669 int dev_get_iflink(const struct net_device *dev)
670 {
671         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
672                 return dev->netdev_ops->ndo_get_iflink(dev);
673 
674         return READ_ONCE(dev->ifindex);
675 }
676 EXPORT_SYMBOL(dev_get_iflink);
677 
678 /**
679  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
680  *      @dev: targeted interface
681  *      @skb: The packet.
682  *
683  *      For better visibility of tunnel traffic OVS needs to retrieve
684  *      egress tunnel information for a packet. Following API allows
685  *      user to get this info.
686  */
687 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
688 {
689         struct ip_tunnel_info *info;
690 
691         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
692                 return -EINVAL;
693 
694         info = skb_tunnel_info_unclone(skb);
695         if (!info)
696                 return -ENOMEM;
697         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
698                 return -EINVAL;
699 
700         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
701 }
702 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
703 
704 static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
705 {
706         int k = stack->num_paths++;
707 
708         if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
709                 return NULL;
710 
711         return &stack->path[k];
712 }
713 
714 int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
715                           struct net_device_path_stack *stack)
716 {
717         const struct net_device *last_dev;
718         struct net_device_path_ctx ctx = {
719                 .dev    = dev,
720         };
721         struct net_device_path *path;
722         int ret = 0;
723 
724         memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
725         stack->num_paths = 0;
726         while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
727                 last_dev = ctx.dev;
728                 path = dev_fwd_path(stack);
729                 if (!path)
730                         return -1;
731 
732                 memset(path, 0, sizeof(struct net_device_path));
733                 ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
734                 if (ret < 0)
735                         return -1;
736 
737                 if (WARN_ON_ONCE(last_dev == ctx.dev))
738                         return -1;
739         }
740 
741         if (!ctx.dev)
742                 return ret;
743 
744         path = dev_fwd_path(stack);
745         if (!path)
746                 return -1;
747         path->type = DEV_PATH_ETHERNET;
748         path->dev = ctx.dev;
749 
750         return ret;
751 }
752 EXPORT_SYMBOL_GPL(dev_fill_forward_path);
753 
754 /**
755  *      __dev_get_by_name       - find a device by its name
756  *      @net: the applicable net namespace
757  *      @name: name to find
758  *
759  *      Find an interface by name. Must be called under RTNL semaphore.
760  *      If the name is found a pointer to the device is returned.
761  *      If the name is not found then %NULL is returned. The
762  *      reference counters are not incremented so the caller must be
763  *      careful with locks.
764  */
765 
766 struct net_device *__dev_get_by_name(struct net *net, const char *name)
767 {
768         struct netdev_name_node *node_name;
769 
770         node_name = netdev_name_node_lookup(net, name);
771         return node_name ? node_name->dev : NULL;
772 }
773 EXPORT_SYMBOL(__dev_get_by_name);
774 
775 /**
776  * dev_get_by_name_rcu  - find a device by its name
777  * @net: the applicable net namespace
778  * @name: name to find
779  *
780  * Find an interface by name.
781  * If the name is found a pointer to the device is returned.
782  * If the name is not found then %NULL is returned.
783  * The reference counters are not incremented so the caller must be
784  * careful with locks. The caller must hold RCU lock.
785  */
786 
787 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
788 {
789         struct netdev_name_node *node_name;
790 
791         node_name = netdev_name_node_lookup_rcu(net, name);
792         return node_name ? node_name->dev : NULL;
793 }
794 EXPORT_SYMBOL(dev_get_by_name_rcu);
795 
796 /* Deprecated for new users, call netdev_get_by_name() instead */
797 struct net_device *dev_get_by_name(struct net *net, const char *name)
798 {
799         struct net_device *dev;
800 
801         rcu_read_lock();
802         dev = dev_get_by_name_rcu(net, name);
803         dev_hold(dev);
804         rcu_read_unlock();
805         return dev;
806 }
807 EXPORT_SYMBOL(dev_get_by_name);
808 
809 /**
810  *      netdev_get_by_name() - find a device by its name
811  *      @net: the applicable net namespace
812  *      @name: name to find
813  *      @tracker: tracking object for the acquired reference
814  *      @gfp: allocation flags for the tracker
815  *
816  *      Find an interface by name. This can be called from any
817  *      context and does its own locking. The returned handle has
818  *      the usage count incremented and the caller must use netdev_put() to
819  *      release it when it is no longer needed. %NULL is returned if no
820  *      matching device is found.
821  */
822 struct net_device *netdev_get_by_name(struct net *net, const char *name,
823                                       netdevice_tracker *tracker, gfp_t gfp)
824 {
825         struct net_device *dev;
826 
827         dev = dev_get_by_name(net, name);
828         if (dev)
829                 netdev_tracker_alloc(dev, tracker, gfp);
830         return dev;
831 }
832 EXPORT_SYMBOL(netdev_get_by_name);
833 
834 /**
835  *      __dev_get_by_index - find a device by its ifindex
836  *      @net: the applicable net namespace
837  *      @ifindex: index of device
838  *
839  *      Search for an interface by index. Returns %NULL if the device
840  *      is not found or a pointer to the device. The device has not
841  *      had its reference counter increased so the caller must be careful
842  *      about locking. The caller must hold the RTNL semaphore.
843  */
844 
845 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
846 {
847         struct net_device *dev;
848         struct hlist_head *head = dev_index_hash(net, ifindex);
849 
850         hlist_for_each_entry(dev, head, index_hlist)
851                 if (dev->ifindex == ifindex)
852                         return dev;
853 
854         return NULL;
855 }
856 EXPORT_SYMBOL(__dev_get_by_index);
857 
858 /**
859  *      dev_get_by_index_rcu - find a device by its ifindex
860  *      @net: the applicable net namespace
861  *      @ifindex: index of device
862  *
863  *      Search for an interface by index. Returns %NULL if the device
864  *      is not found or a pointer to the device. The device has not
865  *      had its reference counter increased so the caller must be careful
866  *      about locking. The caller must hold RCU lock.
867  */
868 
869 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
870 {
871         struct net_device *dev;
872         struct hlist_head *head = dev_index_hash(net, ifindex);
873 
874         hlist_for_each_entry_rcu(dev, head, index_hlist)
875                 if (dev->ifindex == ifindex)
876                         return dev;
877 
878         return NULL;
879 }
880 EXPORT_SYMBOL(dev_get_by_index_rcu);
881 
882 /* Deprecated for new users, call netdev_get_by_index() instead */
883 struct net_device *dev_get_by_index(struct net *net, int ifindex)
884 {
885         struct net_device *dev;
886 
887         rcu_read_lock();
888         dev = dev_get_by_index_rcu(net, ifindex);
889         dev_hold(dev);
890         rcu_read_unlock();
891         return dev;
892 }
893 EXPORT_SYMBOL(dev_get_by_index);
894 
895 /**
896  *      netdev_get_by_index() - find a device by its ifindex
897  *      @net: the applicable net namespace
898  *      @ifindex: index of device
899  *      @tracker: tracking object for the acquired reference
900  *      @gfp: allocation flags for the tracker
901  *
902  *      Search for an interface by index. Returns NULL if the device
903  *      is not found or a pointer to the device. The device returned has
904  *      had a reference added and the pointer is safe until the user calls
905  *      netdev_put() to indicate they have finished with it.
906  */
907 struct net_device *netdev_get_by_index(struct net *net, int ifindex,
908                                        netdevice_tracker *tracker, gfp_t gfp)
909 {
910         struct net_device *dev;
911 
912         dev = dev_get_by_index(net, ifindex);
913         if (dev)
914                 netdev_tracker_alloc(dev, tracker, gfp);
915         return dev;
916 }
917 EXPORT_SYMBOL(netdev_get_by_index);
918 
919 /**
920  *      dev_get_by_napi_id - find a device by napi_id
921  *      @napi_id: ID of the NAPI struct
922  *
923  *      Search for an interface by NAPI ID. Returns %NULL if the device
924  *      is not found or a pointer to the device. The device has not had
925  *      its reference counter increased so the caller must be careful
926  *      about locking. The caller must hold RCU lock.
927  */
928 
929 struct net_device *dev_get_by_napi_id(unsigned int napi_id)
930 {
931         struct napi_struct *napi;
932 
933         WARN_ON_ONCE(!rcu_read_lock_held());
934 
935         if (napi_id < MIN_NAPI_ID)
936                 return NULL;
937 
938         napi = napi_by_id(napi_id);
939 
940         return napi ? napi->dev : NULL;
941 }
942 EXPORT_SYMBOL(dev_get_by_napi_id);
943 
944 static DEFINE_SEQLOCK(netdev_rename_lock);
945 
946 void netdev_copy_name(struct net_device *dev, char *name)
947 {
948         unsigned int seq;
949 
950         do {
951                 seq = read_seqbegin(&netdev_rename_lock);
952                 strscpy(name, dev->name, IFNAMSIZ);
953         } while (read_seqretry(&netdev_rename_lock, seq));
954 }
955 
956 /**
957  *      netdev_get_name - get a netdevice name, knowing its ifindex.
958  *      @net: network namespace
959  *      @name: a pointer to the buffer where the name will be stored.
960  *      @ifindex: the ifindex of the interface to get the name from.
961  */
962 int netdev_get_name(struct net *net, char *name, int ifindex)
963 {
964         struct net_device *dev;
965         int ret;
966 
967         rcu_read_lock();
968 
969         dev = dev_get_by_index_rcu(net, ifindex);
970         if (!dev) {
971                 ret = -ENODEV;
972                 goto out;
973         }
974 
975         netdev_copy_name(dev, name);
976 
977         ret = 0;
978 out:
979         rcu_read_unlock();
980         return ret;
981 }
982 
983 /**
984  *      dev_getbyhwaddr_rcu - find a device by its hardware address
985  *      @net: the applicable net namespace
986  *      @type: media type of device
987  *      @ha: hardware address
988  *
989  *      Search for an interface by MAC address. Returns NULL if the device
990  *      is not found or a pointer to the device.
991  *      The caller must hold RCU or RTNL.
992  *      The returned device has not had its ref count increased
993  *      and the caller must therefore be careful about locking
994  *
995  */
996 
997 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
998                                        const char *ha)
999 {
1000         struct net_device *dev;
1001 
1002         for_each_netdev_rcu(net, dev)
1003                 if (dev->type == type &&
1004                     !memcmp(dev->dev_addr, ha, dev->addr_len))
1005                         return dev;
1006 
1007         return NULL;
1008 }
1009 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
1010 
1011 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
1012 {
1013         struct net_device *dev, *ret = NULL;
1014 
1015         rcu_read_lock();
1016         for_each_netdev_rcu(net, dev)
1017                 if (dev->type == type) {
1018                         dev_hold(dev);
1019                         ret = dev;
1020                         break;
1021                 }
1022         rcu_read_unlock();
1023         return ret;
1024 }
1025 EXPORT_SYMBOL(dev_getfirstbyhwtype);
1026 
1027 /**
1028  *      __dev_get_by_flags - find any device with given flags
1029  *      @net: the applicable net namespace
1030  *      @if_flags: IFF_* values
1031  *      @mask: bitmask of bits in if_flags to check
1032  *
1033  *      Search for any interface with the given flags. Returns NULL if a device
1034  *      is not found or a pointer to the device. Must be called inside
1035  *      rtnl_lock(), and result refcount is unchanged.
1036  */
1037 
1038 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1039                                       unsigned short mask)
1040 {
1041         struct net_device *dev, *ret;
1042 
1043         ASSERT_RTNL();
1044 
1045         ret = NULL;
1046         for_each_netdev(net, dev) {
1047                 if (((dev->flags ^ if_flags) & mask) == 0) {
1048                         ret = dev;
1049                         break;
1050                 }
1051         }
1052         return ret;
1053 }
1054 EXPORT_SYMBOL(__dev_get_by_flags);
1055 
1056 /**
1057  *      dev_valid_name - check if name is okay for network device
1058  *      @name: name string
1059  *
1060  *      Network device names need to be valid file names to
1061  *      allow sysfs to work.  We also disallow any kind of
1062  *      whitespace.
1063  */
1064 bool dev_valid_name(const char *name)
1065 {
1066         if (*name == '\0')
1067                 return false;
1068         if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1069                 return false;
1070         if (!strcmp(name, ".") || !strcmp(name, ".."))
1071                 return false;
1072 
1073         while (*name) {
1074                 if (*name == '/' || *name == ':' || isspace(*name))
1075                         return false;
1076                 name++;
1077         }
1078         return true;
1079 }
1080 EXPORT_SYMBOL(dev_valid_name);
1081 
1082 /**
1083  *      __dev_alloc_name - allocate a name for a device
1084  *      @net: network namespace to allocate the device name in
1085  *      @name: name format string
1086  *      @res: result name string
1087  *
1088  *      Passed a format string - eg "lt%d" it will try and find a suitable
1089  *      id. It scans list of devices to build up a free map, then chooses
1090  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1091  *      while allocating the name and adding the device in order to avoid
1092  *      duplicates.
1093  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1094  *      Returns the number of the unit assigned or a negative errno code.
1095  */
1096 
1097 static int __dev_alloc_name(struct net *net, const char *name, char *res)
1098 {
1099         int i = 0;
1100         const char *p;
1101         const int max_netdevices = 8*PAGE_SIZE;
1102         unsigned long *inuse;
1103         struct net_device *d;
1104         char buf[IFNAMSIZ];
1105 
1106         /* Verify the string as this thing may have come from the user.
1107          * There must be one "%d" and no other "%" characters.
1108          */
1109         p = strchr(name, '%');
1110         if (!p || p[1] != 'd' || strchr(p + 2, '%'))
1111                 return -EINVAL;
1112 
1113         /* Use one page as a bit array of possible slots */
1114         inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
1115         if (!inuse)
1116                 return -ENOMEM;
1117 
1118         for_each_netdev(net, d) {
1119                 struct netdev_name_node *name_node;
1120 
1121                 netdev_for_each_altname(d, name_node) {
1122                         if (!sscanf(name_node->name, name, &i))
1123                                 continue;
1124                         if (i < 0 || i >= max_netdevices)
1125                                 continue;
1126 
1127                         /* avoid cases where sscanf is not exact inverse of printf */
1128                         snprintf(buf, IFNAMSIZ, name, i);
1129                         if (!strncmp(buf, name_node->name, IFNAMSIZ))
1130                                 __set_bit(i, inuse);
1131                 }
1132                 if (!sscanf(d->name, name, &i))
1133                         continue;
1134                 if (i < 0 || i >= max_netdevices)
1135                         continue;
1136 
1137                 /* avoid cases where sscanf is not exact inverse of printf */
1138                 snprintf(buf, IFNAMSIZ, name, i);
1139                 if (!strncmp(buf, d->name, IFNAMSIZ))
1140                         __set_bit(i, inuse);
1141         }
1142 
1143         i = find_first_zero_bit(inuse, max_netdevices);
1144         bitmap_free(inuse);
1145         if (i == max_netdevices)
1146                 return -ENFILE;
1147 
1148         /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
1149         strscpy(buf, name, IFNAMSIZ);
1150         snprintf(res, IFNAMSIZ, buf, i);
1151         return i;
1152 }
1153 
1154 /* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
1155 static int dev_prep_valid_name(struct net *net, struct net_device *dev,
1156                                const char *want_name, char *out_name,
1157                                int dup_errno)
1158 {
1159         if (!dev_valid_name(want_name))
1160                 return -EINVAL;
1161 
1162         if (strchr(want_name, '%'))
1163                 return __dev_alloc_name(net, want_name, out_name);
1164 
1165         if (netdev_name_in_use(net, want_name))
1166                 return -dup_errno;
1167         if (out_name != want_name)
1168                 strscpy(out_name, want_name, IFNAMSIZ);
1169         return 0;
1170 }
1171 
1172 /**
1173  *      dev_alloc_name - allocate a name for a device
1174  *      @dev: device
1175  *      @name: name format string
1176  *
1177  *      Passed a format string - eg "lt%d" it will try and find a suitable
1178  *      id. It scans list of devices to build up a free map, then chooses
1179  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1180  *      while allocating the name and adding the device in order to avoid
1181  *      duplicates.
1182  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1183  *      Returns the number of the unit assigned or a negative errno code.
1184  */
1185 
1186 int dev_alloc_name(struct net_device *dev, const char *name)
1187 {
1188         return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
1189 }
1190 EXPORT_SYMBOL(dev_alloc_name);
1191 
1192 static int dev_get_valid_name(struct net *net, struct net_device *dev,
1193                               const char *name)
1194 {
1195         int ret;
1196 
1197         ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
1198         return ret < 0 ? ret : 0;
1199 }
1200 
1201 /**
1202  *      dev_change_name - change name of a device
1203  *      @dev: device
1204  *      @newname: name (or format string) must be at least IFNAMSIZ
1205  *
1206  *      Change name of a device, can pass format strings "eth%d".
1207  *      for wildcarding.
1208  */
1209 int dev_change_name(struct net_device *dev, const char *newname)
1210 {
1211         unsigned char old_assign_type;
1212         char oldname[IFNAMSIZ];
1213         int err = 0;
1214         int ret;
1215         struct net *net;
1216 
1217         ASSERT_RTNL();
1218         BUG_ON(!dev_net(dev));
1219 
1220         net = dev_net(dev);
1221 
1222         down_write(&devnet_rename_sem);
1223 
1224         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1225                 up_write(&devnet_rename_sem);
1226                 return 0;
1227         }
1228 
1229         memcpy(oldname, dev->name, IFNAMSIZ);
1230 
1231         write_seqlock_bh(&netdev_rename_lock);
1232         err = dev_get_valid_name(net, dev, newname);
1233         write_sequnlock_bh(&netdev_rename_lock);
1234 
1235         if (err < 0) {
1236                 up_write(&devnet_rename_sem);
1237                 return err;
1238         }
1239 
1240         if (oldname[0] && !strchr(oldname, '%'))
1241                 netdev_info(dev, "renamed from %s%s\n", oldname,
1242                             dev->flags & IFF_UP ? " (while UP)" : "");
1243 
1244         old_assign_type = dev->name_assign_type;
1245         WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);
1246 
1247 rollback:
1248         ret = device_rename(&dev->dev, dev->name);
1249         if (ret) {
1250                 memcpy(dev->name, oldname, IFNAMSIZ);
1251                 WRITE_ONCE(dev->name_assign_type, old_assign_type);
1252                 up_write(&devnet_rename_sem);
1253                 return ret;
1254         }
1255 
1256         up_write(&devnet_rename_sem);
1257 
1258         netdev_adjacent_rename_links(dev, oldname);
1259 
1260         netdev_name_node_del(dev->name_node);
1261 
1262         synchronize_net();
1263 
1264         netdev_name_node_add(net, dev->name_node);
1265 
1266         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1267         ret = notifier_to_errno(ret);
1268 
1269         if (ret) {
1270                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1271                 if (err >= 0) {
1272                         err = ret;
1273                         down_write(&devnet_rename_sem);
1274                         write_seqlock_bh(&netdev_rename_lock);
1275                         memcpy(dev->name, oldname, IFNAMSIZ);
1276                         write_sequnlock_bh(&netdev_rename_lock);
1277                         memcpy(oldname, newname, IFNAMSIZ);
1278                         WRITE_ONCE(dev->name_assign_type, old_assign_type);
1279                         old_assign_type = NET_NAME_RENAMED;
1280                         goto rollback;
1281                 } else {
1282                         netdev_err(dev, "name change rollback failed: %d\n",
1283                                    ret);
1284                 }
1285         }
1286 
1287         return err;
1288 }
1289 
1290 /**
1291  *      dev_set_alias - change ifalias of a device
1292  *      @dev: device
1293  *      @alias: name up to IFALIASZ
1294  *      @len: limit of bytes to copy from info
1295  *
1296  *      Set ifalias for a device,
1297  */
1298 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1299 {
1300         struct dev_ifalias *new_alias = NULL;
1301 
1302         if (len >= IFALIASZ)
1303                 return -EINVAL;
1304 
1305         if (len) {
1306                 new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1307                 if (!new_alias)
1308                         return -ENOMEM;
1309 
1310                 memcpy(new_alias->ifalias, alias, len);
1311                 new_alias->ifalias[len] = 0;
1312         }
1313 
1314         mutex_lock(&ifalias_mutex);
1315         new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1316                                         mutex_is_locked(&ifalias_mutex));
1317         mutex_unlock(&ifalias_mutex);
1318 
1319         if (new_alias)
1320                 kfree_rcu(new_alias, rcuhead);
1321 
1322         return len;
1323 }
1324 EXPORT_SYMBOL(dev_set_alias);
1325 
1326 /**
1327  *      dev_get_alias - get ifalias of a device
1328  *      @dev: device
1329  *      @name: buffer to store name of ifalias
1330  *      @len: size of buffer
1331  *
1332  *      get ifalias for a device.  Caller must make sure dev cannot go
1333  *      away,  e.g. rcu read lock or own a reference count to device.
1334  */
1335 int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1336 {
1337         const struct dev_ifalias *alias;
1338         int ret = 0;
1339 
1340         rcu_read_lock();
1341         alias = rcu_dereference(dev->ifalias);
1342         if (alias)
1343                 ret = snprintf(name, len, "%s", alias->ifalias);
1344         rcu_read_unlock();
1345 
1346         return ret;
1347 }
1348 
1349 /**
1350  *      netdev_features_change - device changes features
1351  *      @dev: device to cause notification
1352  *
1353  *      Called to indicate a device has changed features.
1354  */
1355 void netdev_features_change(struct net_device *dev)
1356 {
1357         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1358 }
1359 EXPORT_SYMBOL(netdev_features_change);
1360 
1361 /**
1362  *      netdev_state_change - device changes state
1363  *      @dev: device to cause notification
1364  *
1365  *      Called to indicate a device has changed state. This function calls
1366  *      the notifier chains for netdev_chain and sends a NEWLINK message
1367  *      to the routing socket.
1368  */
1369 void netdev_state_change(struct net_device *dev)
1370 {
1371         if (dev->flags & IFF_UP) {
1372                 struct netdev_notifier_change_info change_info = {
1373                         .info.dev = dev,
1374                 };
1375 
1376                 call_netdevice_notifiers_info(NETDEV_CHANGE,
1377                                               &change_info.info);
1378                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
1379         }
1380 }
1381 EXPORT_SYMBOL(netdev_state_change);
1382 
1383 /**
1384  * __netdev_notify_peers - notify network peers about existence of @dev,
1385  * to be called when rtnl lock is already held.
1386  * @dev: network device
1387  *
1388  * Generate traffic such that interested network peers are aware of
1389  * @dev, such as by generating a gratuitous ARP. This may be used when
1390  * a device wants to inform the rest of the network about some sort of
1391  * reconfiguration such as a failover event or virtual machine
1392  * migration.
1393  */
1394 void __netdev_notify_peers(struct net_device *dev)
1395 {
1396         ASSERT_RTNL();
1397         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1398         call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1399 }
1400 EXPORT_SYMBOL(__netdev_notify_peers);
1401 
1402 /**
1403  * netdev_notify_peers - notify network peers about existence of @dev
1404  * @dev: network device
1405  *
1406  * Generate traffic such that interested network peers are aware of
1407  * @dev, such as by generating a gratuitous ARP. This may be used when
1408  * a device wants to inform the rest of the network about some sort of
1409  * reconfiguration such as a failover event or virtual machine
1410  * migration.
1411  */
1412 void netdev_notify_peers(struct net_device *dev)
1413 {
1414         rtnl_lock();
1415         __netdev_notify_peers(dev);
1416         rtnl_unlock();
1417 }
1418 EXPORT_SYMBOL(netdev_notify_peers);
1419 
1420 static int napi_threaded_poll(void *data);
1421 
1422 static int napi_kthread_create(struct napi_struct *n)
1423 {
1424         int err = 0;
1425 
1426         /* Create and wake up the kthread once to put it in
1427          * TASK_INTERRUPTIBLE mode to avoid the blocked task
1428          * warning and work with loadavg.
1429          */
1430         n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
1431                                 n->dev->name, n->napi_id);
1432         if (IS_ERR(n->thread)) {
1433                 err = PTR_ERR(n->thread);
1434                 pr_err("kthread_run failed with err %d\n", err);
1435                 n->thread = NULL;
1436         }
1437 
1438         return err;
1439 }
1440 
1441 static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1442 {
1443         const struct net_device_ops *ops = dev->netdev_ops;
1444         int ret;
1445 
1446         ASSERT_RTNL();
1447         dev_addr_check(dev);
1448 
1449         if (!netif_device_present(dev)) {
1450                 /* may be detached because parent is runtime-suspended */
1451                 if (dev->dev.parent)
1452                         pm_runtime_resume(dev->dev.parent);
1453                 if (!netif_device_present(dev))
1454                         return -ENODEV;
1455         }
1456 
1457         /* Block netpoll from trying to do any rx path servicing.
1458          * If we don't do this there is a chance ndo_poll_controller
1459          * or ndo_poll may be running while we open the device
1460          */
1461         netpoll_poll_disable(dev);
1462 
1463         ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1464         ret = notifier_to_errno(ret);
1465         if (ret)
1466                 return ret;
1467 
1468         set_bit(__LINK_STATE_START, &dev->state);
1469 
1470         if (ops->ndo_validate_addr)
1471                 ret = ops->ndo_validate_addr(dev);
1472 
1473         if (!ret && ops->ndo_open)
1474                 ret = ops->ndo_open(dev);
1475 
1476         netpoll_poll_enable(dev);
1477 
1478         if (ret)
1479                 clear_bit(__LINK_STATE_START, &dev->state);
1480         else {
1481                 dev->flags |= IFF_UP;
1482                 dev_set_rx_mode(dev);
1483                 dev_activate(dev);
1484                 add_device_randomness(dev->dev_addr, dev->addr_len);
1485         }
1486 
1487         return ret;
1488 }
1489 
1490 /**
1491  *      dev_open        - prepare an interface for use.
1492  *      @dev: device to open
1493  *      @extack: netlink extended ack
1494  *
1495  *      Takes a device from down to up state. The device's private open
1496  *      function is invoked and then the multicast lists are loaded. Finally
1497  *      the device is moved into the up state and a %NETDEV_UP message is
1498  *      sent to the netdev notifier chain.
1499  *
1500  *      Calling this function on an active interface is a nop. On a failure
1501  *      a negative errno code is returned.
1502  */
1503 int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1504 {
1505         int ret;
1506 
1507         if (dev->flags & IFF_UP)
1508                 return 0;
1509 
1510         ret = __dev_open(dev, extack);
1511         if (ret < 0)
1512                 return ret;
1513 
1514         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1515         call_netdevice_notifiers(NETDEV_UP, dev);
1516 
1517         return ret;
1518 }
1519 EXPORT_SYMBOL(dev_open);
1520 
1521 static void __dev_close_many(struct list_head *head)
1522 {
1523         struct net_device *dev;
1524 
1525         ASSERT_RTNL();
1526         might_sleep();
1527 
1528         list_for_each_entry(dev, head, close_list) {
1529                 /* Temporarily disable netpoll until the interface is down */
1530                 netpoll_poll_disable(dev);
1531 
1532                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1533 
1534                 clear_bit(__LINK_STATE_START, &dev->state);
1535 
1536                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1537                  * can be even on different cpu. So just clear netif_running().
1538                  *
1539                  * dev->stop() will invoke napi_disable() on all of it's
1540                  * napi_struct instances on this device.
1541                  */
1542                 smp_mb__after_atomic(); /* Commit netif_running(). */
1543         }
1544 
1545         dev_deactivate_many(head);
1546 
1547         list_for_each_entry(dev, head, close_list) {
1548                 const struct net_device_ops *ops = dev->netdev_ops;
1549 
1550                 /*
1551                  *      Call the device specific close. This cannot fail.
1552                  *      Only if device is UP
1553                  *
1554                  *      We allow it to be called even after a DETACH hot-plug
1555                  *      event.
1556                  */
1557                 if (ops->ndo_stop)
1558                         ops->ndo_stop(dev);
1559 
1560                 dev->flags &= ~IFF_UP;
1561                 netpoll_poll_enable(dev);
1562         }
1563 }
1564 
1565 static void __dev_close(struct net_device *dev)
1566 {
1567         LIST_HEAD(single);
1568 
1569         list_add(&dev->close_list, &single);
1570         __dev_close_many(&single);
1571         list_del(&single);
1572 }
1573 
1574 void dev_close_many(struct list_head *head, bool unlink)
1575 {
1576         struct net_device *dev, *tmp;
1577 
1578         /* Remove the devices that don't need to be closed */
1579         list_for_each_entry_safe(dev, tmp, head, close_list)
1580                 if (!(dev->flags & IFF_UP))
1581                         list_del_init(&dev->close_list);
1582 
1583         __dev_close_many(head);
1584 
1585         list_for_each_entry_safe(dev, tmp, head, close_list) {
1586                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
1587                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1588                 if (unlink)
1589                         list_del_init(&dev->close_list);
1590         }
1591 }
1592 EXPORT_SYMBOL(dev_close_many);
1593 
1594 /**
1595  *      dev_close - shutdown an interface.
1596  *      @dev: device to shutdown
1597  *
1598  *      This function moves an active device into down state. A
1599  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1600  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1601  *      chain.
1602  */
1603 void dev_close(struct net_device *dev)
1604 {
1605         if (dev->flags & IFF_UP) {
1606                 LIST_HEAD(single);
1607 
1608                 list_add(&dev->close_list, &single);
1609                 dev_close_many(&single, true);
1610                 list_del(&single);
1611         }
1612 }
1613 EXPORT_SYMBOL(dev_close);
1614 
1615 
1616 /**
1617  *      dev_disable_lro - disable Large Receive Offload on a device
1618  *      @dev: device
1619  *
1620  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1621  *      called under RTNL.  This is needed if received packets may be
1622  *      forwarded to another interface.
1623  */
1624 void dev_disable_lro(struct net_device *dev)
1625 {
1626         struct net_device *lower_dev;
1627         struct list_head *iter;
1628 
1629         dev->wanted_features &= ~NETIF_F_LRO;
1630         netdev_update_features(dev);
1631 
1632         if (unlikely(dev->features & NETIF_F_LRO))
1633                 netdev_WARN(dev, "failed to disable LRO!\n");
1634 
1635         netdev_for_each_lower_dev(dev, lower_dev, iter)
1636                 dev_disable_lro(lower_dev);
1637 }
1638 EXPORT_SYMBOL(dev_disable_lro);
1639 
1640 /**
1641  *      dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1642  *      @dev: device
1643  *
1644  *      Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
1645  *      called under RTNL.  This is needed if Generic XDP is installed on
1646  *      the device.
1647  */
1648 static void dev_disable_gro_hw(struct net_device *dev)
1649 {
1650         dev->wanted_features &= ~NETIF_F_GRO_HW;
1651         netdev_update_features(dev);
1652 
1653         if (unlikely(dev->features & NETIF_F_GRO_HW))
1654                 netdev_WARN(dev, "failed to disable GRO_HW!\n");
1655 }
1656 
1657 const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1658 {
1659 #define N(val)                                          \
1660         case NETDEV_##val:                              \
1661                 return "NETDEV_" __stringify(val);
1662         switch (cmd) {
1663         N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1664         N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1665         N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1666         N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
1667         N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
1668         N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
1669         N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1670         N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1671         N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1672         N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
1673         N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
1674         N(XDP_FEAT_CHANGE)
1675         }
1676 #undef N
1677         return "UNKNOWN_NETDEV_EVENT";
1678 }
1679 EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1680 
1681 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1682                                    struct net_device *dev)
1683 {
1684         struct netdev_notifier_info info = {
1685                 .dev = dev,
1686         };
1687 
1688         return nb->notifier_call(nb, val, &info);
1689 }
1690 
1691 static int call_netdevice_register_notifiers(struct notifier_block *nb,
1692                                              struct net_device *dev)
1693 {
1694         int err;
1695 
1696         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1697         err = notifier_to_errno(err);
1698         if (err)
1699                 return err;
1700 
1701         if (!(dev->flags & IFF_UP))
1702                 return 0;
1703 
1704         call_netdevice_notifier(nb, NETDEV_UP, dev);
1705         return 0;
1706 }
1707 
1708 static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1709                                                 struct net_device *dev)
1710 {
1711         if (dev->flags & IFF_UP) {
1712                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1713                                         dev);
1714                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1715         }
1716         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1717 }
1718 
1719 static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1720                                                  struct net *net)
1721 {
1722         struct net_device *dev;
1723         int err;
1724 
1725         for_each_netdev(net, dev) {
1726                 err = call_netdevice_register_notifiers(nb, dev);
1727                 if (err)
1728                         goto rollback;
1729         }
1730         return 0;
1731 
1732 rollback:
1733         for_each_netdev_continue_reverse(net, dev)
1734                 call_netdevice_unregister_notifiers(nb, dev);
1735         return err;
1736 }
1737 
1738 static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1739                                                     struct net *net)
1740 {
1741         struct net_device *dev;
1742 
1743         for_each_netdev(net, dev)
1744                 call_netdevice_unregister_notifiers(nb, dev);
1745 }
1746 
1747 static int dev_boot_phase = 1;
1748 
1749 /**
1750  * register_netdevice_notifier - register a network notifier block
1751  * @nb: notifier
1752  *
1753  * Register a notifier to be called when network device events occur.
1754  * The notifier passed is linked into the kernel structures and must
1755  * not be reused until it has been unregistered. A negative errno code
1756  * is returned on a failure.
1757  *
1758  * When registered all registration and up events are replayed
1759  * to the new notifier to allow device to have a race free
1760  * view of the network device list.
1761  */
1762 
1763 int register_netdevice_notifier(struct notifier_block *nb)
1764 {
1765         struct net *net;
1766         int err;
1767 
1768         /* Close race with setup_net() and cleanup_net() */
1769         down_write(&pernet_ops_rwsem);
1770         rtnl_lock();
1771         err = raw_notifier_chain_register(&netdev_chain, nb);
1772         if (err)
1773                 goto unlock;
1774         if (dev_boot_phase)
1775                 goto unlock;
1776         for_each_net(net) {
1777                 err = call_netdevice_register_net_notifiers(nb, net);
1778                 if (err)
1779                         goto rollback;
1780         }
1781 
1782 unlock:
1783         rtnl_unlock();
1784         up_write(&pernet_ops_rwsem);
1785         return err;
1786 
1787 rollback:
1788         for_each_net_continue_reverse(net)
1789                 call_netdevice_unregister_net_notifiers(nb, net);
1790 
1791         raw_notifier_chain_unregister(&netdev_chain, nb);
1792         goto unlock;
1793 }
1794 EXPORT_SYMBOL(register_netdevice_notifier);
1795 
1796 /**
1797  * unregister_netdevice_notifier - unregister a network notifier block
1798  * @nb: notifier
1799  *
1800  * Unregister a notifier previously registered by
1801  * register_netdevice_notifier(). The notifier is unlinked into the
1802  * kernel structures and may then be reused. A negative errno code
1803  * is returned on a failure.
1804  *
1805  * After unregistering unregister and down device events are synthesized
1806  * for all devices on the device list to the removed notifier to remove
1807  * the need for special case cleanup code.
1808  */
1809 
1810 int unregister_netdevice_notifier(struct notifier_block *nb)
1811 {
1812         struct net *net;
1813         int err;
1814 
1815         /* Close race with setup_net() and cleanup_net() */
1816         down_write(&pernet_ops_rwsem);
1817         rtnl_lock();
1818         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1819         if (err)
1820                 goto unlock;
1821 
1822         for_each_net(net)
1823                 call_netdevice_unregister_net_notifiers(nb, net);
1824 
1825 unlock:
1826         rtnl_unlock();
1827         up_write(&pernet_ops_rwsem);
1828         return err;
1829 }
1830 EXPORT_SYMBOL(unregister_netdevice_notifier);
1831 
1832 static int __register_netdevice_notifier_net(struct net *net,
1833                                              struct notifier_block *nb,
1834                                              bool ignore_call_fail)
1835 {
1836         int err;
1837 
1838         err = raw_notifier_chain_register(&net->netdev_chain, nb);
1839         if (err)
1840                 return err;
1841         if (dev_boot_phase)
1842                 return 0;
1843 
1844         err = call_netdevice_register_net_notifiers(nb, net);
1845         if (err && !ignore_call_fail)
1846                 goto chain_unregister;
1847 
1848         return 0;
1849 
1850 chain_unregister:
1851         raw_notifier_chain_unregister(&net->netdev_chain, nb);
1852         return err;
1853 }
1854 
1855 static int __unregister_netdevice_notifier_net(struct net *net,
1856                                                struct notifier_block *nb)
1857 {
1858         int err;
1859 
1860         err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1861         if (err)
1862                 return err;
1863 
1864         call_netdevice_unregister_net_notifiers(nb, net);
1865         return 0;
1866 }
1867 
1868 /**
1869  * register_netdevice_notifier_net - register a per-netns network notifier block
1870  * @net: network namespace
1871  * @nb: notifier
1872  *
1873  * Register a notifier to be called when network device events occur.
1874  * The notifier passed is linked into the kernel structures and must
1875  * not be reused until it has been unregistered. A negative errno code
1876  * is returned on a failure.
1877  *
1878  * When registered all registration and up events are replayed
1879  * to the new notifier to allow device to have a race free
1880  * view of the network device list.
1881  */
1882 
1883 int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1884 {
1885         int err;
1886 
1887         rtnl_lock();
1888         err = __register_netdevice_notifier_net(net, nb, false);
1889         rtnl_unlock();
1890         return err;
1891 }
1892 EXPORT_SYMBOL(register_netdevice_notifier_net);
1893 
1894 /**
1895  * unregister_netdevice_notifier_net - unregister a per-netns
1896  *                                     network notifier block
1897  * @net: network namespace
1898  * @nb: notifier
1899  *
1900  * Unregister a notifier previously registered by
1901  * register_netdevice_notifier_net(). The notifier is unlinked from the
1902  * kernel structures and may then be reused. A negative errno code
1903  * is returned on a failure.
1904  *
1905  * After unregistering unregister and down device events are synthesized
1906  * for all devices on the device list to the removed notifier to remove
1907  * the need for special case cleanup code.
1908  */
1909 
1910 int unregister_netdevice_notifier_net(struct net *net,
1911                                       struct notifier_block *nb)
1912 {
1913         int err;
1914 
1915         rtnl_lock();
1916         err = __unregister_netdevice_notifier_net(net, nb);
1917         rtnl_unlock();
1918         return err;
1919 }
1920 EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1921 
1922 static void __move_netdevice_notifier_net(struct net *src_net,
1923                                           struct net *dst_net,
1924                                           struct notifier_block *nb)
1925 {
1926         __unregister_netdevice_notifier_net(src_net, nb);
1927         __register_netdevice_notifier_net(dst_net, nb, true);
1928 }
1929 
1930 int register_netdevice_notifier_dev_net(struct net_device *dev,
1931                                         struct notifier_block *nb,
1932                                         struct netdev_net_notifier *nn)
1933 {
1934         int err;
1935 
1936         rtnl_lock();
1937         err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
1938         if (!err) {
1939                 nn->nb = nb;
1940                 list_add(&nn->list, &dev->net_notifier_list);
1941         }
1942         rtnl_unlock();
1943         return err;
1944 }
1945 EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
1946 
1947 int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1948                                           struct notifier_block *nb,
1949                                           struct netdev_net_notifier *nn)
1950 {
1951         int err;
1952 
1953         rtnl_lock();
1954         list_del(&nn->list);
1955         err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
1956         rtnl_unlock();
1957         return err;
1958 }
1959 EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
1960 
1961 static void move_netdevice_notifiers_dev_net(struct net_device *dev,
1962                                              struct net *net)
1963 {
1964         struct netdev_net_notifier *nn;
1965 
1966         list_for_each_entry(nn, &dev->net_notifier_list, list)
1967                 __move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
1968 }
1969 
1970 /**
1971  *      call_netdevice_notifiers_info - call all network notifier blocks
1972  *      @val: value passed unmodified to notifier function
1973  *      @info: notifier information data
1974  *
1975  *      Call all network notifier blocks.  Parameters and return value
1976  *      are as for raw_notifier_call_chain().
1977  */
1978 
1979 int call_netdevice_notifiers_info(unsigned long val,
1980                                   struct netdev_notifier_info *info)
1981 {
1982         struct net *net = dev_net(info->dev);
1983         int ret;
1984 
1985         ASSERT_RTNL();
1986 
1987         /* Run per-netns notifier block chain first, then run the global one.
1988          * Hopefully, one day, the global one is going to be removed after
1989          * all notifier block registrators get converted to be per-netns.
1990          */
1991         ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
1992         if (ret & NOTIFY_STOP_MASK)
1993                 return ret;
1994         return raw_notifier_call_chain(&netdev_chain, val, info);
1995 }
1996 
1997 /**
1998  *      call_netdevice_notifiers_info_robust - call per-netns notifier blocks
1999  *                                             for and rollback on error
2000  *      @val_up: value passed unmodified to notifier function
2001  *      @val_down: value passed unmodified to the notifier function when
2002  *                 recovering from an error on @val_up
2003  *      @info: notifier information data
2004  *
2005  *      Call all per-netns network notifier blocks, but not notifier blocks on
2006  *      the global notifier chain. Parameters and return value are as for
2007  *      raw_notifier_call_chain_robust().
2008  */
2009 
2010 static int
2011 call_netdevice_notifiers_info_robust(unsigned long val_up,
2012                                      unsigned long val_down,
2013                                      struct netdev_notifier_info *info)
2014 {
2015         struct net *net = dev_net(info->dev);
2016 
2017         ASSERT_RTNL();
2018 
2019         return raw_notifier_call_chain_robust(&net->netdev_chain,
2020                                               val_up, val_down, info);
2021 }
2022 
2023 static int call_netdevice_notifiers_extack(unsigned long val,
2024                                            struct net_device *dev,
2025                                            struct netlink_ext_ack *extack)
2026 {
2027         struct netdev_notifier_info info = {
2028                 .dev = dev,
2029                 .extack = extack,
2030         };
2031 
2032         return call_netdevice_notifiers_info(val, &info);
2033 }
2034 
2035 /**
2036  *      call_netdevice_notifiers - call all network notifier blocks
2037  *      @val: value passed unmodified to notifier function
2038  *      @dev: net_device pointer passed unmodified to notifier function
2039  *
2040  *      Call all network notifier blocks.  Parameters and return value
2041  *      are as for raw_notifier_call_chain().
2042  */
2043 
2044 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2045 {
2046         return call_netdevice_notifiers_extack(val, dev, NULL);
2047 }
2048 EXPORT_SYMBOL(call_netdevice_notifiers);
2049 
2050 /**
2051  *      call_netdevice_notifiers_mtu - call all network notifier blocks
2052  *      @val: value passed unmodified to notifier function
2053  *      @dev: net_device pointer passed unmodified to notifier function
2054  *      @arg: additional u32 argument passed to the notifier function
2055  *
2056  *      Call all network notifier blocks.  Parameters and return value
2057  *      are as for raw_notifier_call_chain().
2058  */
2059 static int call_netdevice_notifiers_mtu(unsigned long val,
2060                                         struct net_device *dev, u32 arg)
2061 {
2062         struct netdev_notifier_info_ext info = {
2063                 .info.dev = dev,
2064                 .ext.mtu = arg,
2065         };
2066 
2067         BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2068 
2069         return call_netdevice_notifiers_info(val, &info.info);
2070 }
2071 
2072 #ifdef CONFIG_NET_INGRESS
2073 static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2074 
2075 void net_inc_ingress_queue(void)
2076 {
2077         static_branch_inc(&ingress_needed_key);
2078 }
2079 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2080 
2081 void net_dec_ingress_queue(void)
2082 {
2083         static_branch_dec(&ingress_needed_key);
2084 }
2085 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2086 #endif
2087 
2088 #ifdef CONFIG_NET_EGRESS
2089 static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2090 
2091 void net_inc_egress_queue(void)
2092 {
2093         static_branch_inc(&egress_needed_key);
2094 }
2095 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2096 
2097 void net_dec_egress_queue(void)
2098 {
2099         static_branch_dec(&egress_needed_key);
2100 }
2101 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2102 #endif
2103 
2104 #ifdef CONFIG_NET_CLS_ACT
2105 DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key);
2106 EXPORT_SYMBOL(tcf_bypass_check_needed_key);
2107 #endif
2108 
2109 DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2110 EXPORT_SYMBOL(netstamp_needed_key);
2111 #ifdef CONFIG_JUMP_LABEL
2112 static atomic_t netstamp_needed_deferred;
2113 static atomic_t netstamp_wanted;
2114 static void netstamp_clear(struct work_struct *work)
2115 {
2116         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2117         int wanted;
2118 
2119         wanted = atomic_add_return(deferred, &netstamp_wanted);
2120         if (wanted > 0)
2121                 static_branch_enable(&netstamp_needed_key);
2122         else
2123                 static_branch_disable(&netstamp_needed_key);
2124 }
2125 static DECLARE_WORK(netstamp_work, netstamp_clear);
2126 #endif
2127 
2128 void net_enable_timestamp(void)
2129 {
2130 #ifdef CONFIG_JUMP_LABEL
2131         int wanted = atomic_read(&netstamp_wanted);
2132 
2133         while (wanted > 0) {
2134                 if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
2135                         return;
2136         }
2137         atomic_inc(&netstamp_needed_deferred);
2138         schedule_work(&netstamp_work);
2139 #else
2140         static_branch_inc(&netstamp_needed_key);
2141 #endif
2142 }
2143 EXPORT_SYMBOL(net_enable_timestamp);
2144 
2145 void net_disable_timestamp(void)
2146 {
2147 #ifdef CONFIG_JUMP_LABEL
2148         int wanted = atomic_read(&netstamp_wanted);
2149 
2150         while (wanted > 1) {
2151                 if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
2152                         return;
2153         }
2154         atomic_dec(&netstamp_needed_deferred);
2155         schedule_work(&netstamp_work);
2156 #else
2157         static_branch_dec(&netstamp_needed_key);
2158 #endif
2159 }
2160 EXPORT_SYMBOL(net_disable_timestamp);
2161 
2162 static inline void net_timestamp_set(struct sk_buff *skb)
2163 {
2164         skb->tstamp = 0;
2165         skb->tstamp_type = SKB_CLOCK_REALTIME;
2166         if (static_branch_unlikely(&netstamp_needed_key))
2167                 skb->tstamp = ktime_get_real();
2168 }
2169 
2170 #define net_timestamp_check(COND, SKB)                          \
2171         if (static_branch_unlikely(&netstamp_needed_key)) {     \
2172                 if ((COND) && !(SKB)->tstamp)                   \
2173                         (SKB)->tstamp = ktime_get_real();       \
2174         }                                                       \
2175 
2176 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2177 {
2178         return __is_skb_forwardable(dev, skb, true);
2179 }
2180 EXPORT_SYMBOL_GPL(is_skb_forwardable);
2181 
2182 static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
2183                               bool check_mtu)
2184 {
2185         int ret = ____dev_forward_skb(dev, skb, check_mtu);
2186 
2187         if (likely(!ret)) {
2188                 skb->protocol = eth_type_trans(skb, dev);
2189                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2190         }
2191 
2192         return ret;
2193 }
2194 
2195 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2196 {
2197         return __dev_forward_skb2(dev, skb, true);
2198 }
2199 EXPORT_SYMBOL_GPL(__dev_forward_skb);
2200 
2201 /**
2202  * dev_forward_skb - loopback an skb to another netif
2203  *
2204  * @dev: destination network device
2205  * @skb: buffer to forward
2206  *
2207  * return values:
2208  *      NET_RX_SUCCESS  (no congestion)
2209  *      NET_RX_DROP     (packet was dropped, but freed)
2210  *
2211  * dev_forward_skb can be used for injecting an skb from the
2212  * start_xmit function of one device into the receive queue
2213  * of another device.
2214  *
2215  * The receiving device may be in another namespace, so
2216  * we have to clear all information in the skb that could
2217  * impact namespace isolation.
2218  */
2219 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2220 {
2221         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2222 }
2223 EXPORT_SYMBOL_GPL(dev_forward_skb);
2224 
2225 int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
2226 {
2227         return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
2228 }
2229 
2230 static inline int deliver_skb(struct sk_buff *skb,
2231                               struct packet_type *pt_prev,
2232                               struct net_device *orig_dev)
2233 {
2234         if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2235                 return -ENOMEM;
2236         refcount_inc(&skb->users);
2237         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2238 }
2239 
2240 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2241                                           struct packet_type **pt,
2242                                           struct net_device *orig_dev,
2243                                           __be16 type,
2244                                           struct list_head *ptype_list)
2245 {
2246         struct packet_type *ptype, *pt_prev = *pt;
2247 
2248         list_for_each_entry_rcu(ptype, ptype_list, list) {
2249                 if (ptype->type != type)
2250                         continue;
2251                 if (pt_prev)
2252                         deliver_skb(skb, pt_prev, orig_dev);
2253                 pt_prev = ptype;
2254         }
2255         *pt = pt_prev;
2256 }
2257 
2258 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2259 {
2260         if (!ptype->af_packet_priv || !skb->sk)
2261                 return false;
2262 
2263         if (ptype->id_match)
2264                 return ptype->id_match(ptype, skb->sk);
2265         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2266                 return true;
2267 
2268         return false;
2269 }
2270 
2271 /**
2272  * dev_nit_active - return true if any network interface taps are in use
2273  *
2274  * @dev: network device to check for the presence of taps
2275  */
2276 bool dev_nit_active(struct net_device *dev)
2277 {
2278         return !list_empty(&net_hotdata.ptype_all) ||
2279                !list_empty(&dev->ptype_all);
2280 }
2281 EXPORT_SYMBOL_GPL(dev_nit_active);
2282 
2283 /*
2284  *      Support routine. Sends outgoing frames to any network
2285  *      taps currently in use.
2286  */
2287 
2288 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2289 {
2290         struct list_head *ptype_list = &net_hotdata.ptype_all;
2291         struct packet_type *ptype, *pt_prev = NULL;
2292         struct sk_buff *skb2 = NULL;
2293 
2294         rcu_read_lock();
2295 again:
2296         list_for_each_entry_rcu(ptype, ptype_list, list) {
2297                 if (READ_ONCE(ptype->ignore_outgoing))
2298                         continue;
2299 
2300                 /* Never send packets back to the socket
2301                  * they originated from - MvS (miquels@drinkel.ow.org)
2302                  */
2303                 if (skb_loop_sk(ptype, skb))
2304                         continue;
2305 
2306                 if (pt_prev) {
2307                         deliver_skb(skb2, pt_prev, skb->dev);
2308                         pt_prev = ptype;
2309                         continue;
2310                 }
2311 
2312                 /* need to clone skb, done only once */
2313                 skb2 = skb_clone(skb, GFP_ATOMIC);
2314                 if (!skb2)
2315                         goto out_unlock;
2316 
2317                 net_timestamp_set(skb2);
2318 
2319                 /* skb->nh should be correctly
2320                  * set by sender, so that the second statement is
2321                  * just protection against buggy protocols.
2322                  */
2323                 skb_reset_mac_header(skb2);
2324 
2325                 if (skb_network_header(skb2) < skb2->data ||
2326                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2327                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2328                                              ntohs(skb2->protocol),
2329                                              dev->name);
2330                         skb_reset_network_header(skb2);
2331                 }
2332 
2333                 skb2->transport_header = skb2->network_header;
2334                 skb2->pkt_type = PACKET_OUTGOING;
2335                 pt_prev = ptype;
2336         }
2337 
2338         if (ptype_list == &net_hotdata.ptype_all) {
2339                 ptype_list = &dev->ptype_all;
2340                 goto again;
2341         }
2342 out_unlock:
2343         if (pt_prev) {
2344                 if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2345                         pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2346                 else
2347                         kfree_skb(skb2);
2348         }
2349         rcu_read_unlock();
2350 }
2351 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2352 
2353 /**
2354  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2355  * @dev: Network device
2356  * @txq: number of queues available
2357  *
2358  * If real_num_tx_queues is changed the tc mappings may no longer be
2359  * valid. To resolve this verify the tc mapping remains valid and if
2360  * not NULL the mapping. With no priorities mapping to this
2361  * offset/count pair it will no longer be used. In the worst case TC0
2362  * is invalid nothing can be done so disable priority mappings. If is
2363  * expected that drivers will fix this mapping if they can before
2364  * calling netif_set_real_num_tx_queues.
2365  */
2366 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2367 {
2368         int i;
2369         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2370 
2371         /* If TC0 is invalidated disable TC mapping */
2372         if (tc->offset + tc->count > txq) {
2373                 netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2374                 dev->num_tc = 0;
2375                 return;
2376         }
2377 
2378         /* Invalidated prio to tc mappings set to TC0 */
2379         for (i = 1; i < TC_BITMASK + 1; i++) {
2380                 int q = netdev_get_prio_tc_map(dev, i);
2381 
2382                 tc = &dev->tc_to_txq[q];
2383                 if (tc->offset + tc->count > txq) {
2384                         netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2385                                     i, q);
2386                         netdev_set_prio_tc_map(dev, i, 0);
2387                 }
2388         }
2389 }
2390 
2391 int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2392 {
2393         if (dev->num_tc) {
2394                 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2395                 int i;
2396 
2397                 /* walk through the TCs and see if it falls into any of them */
2398                 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2399                         if ((txq - tc->offset) < tc->count)
2400                                 return i;
2401                 }
2402 
2403                 /* didn't find it, just return -1 to indicate no match */
2404                 return -1;
2405         }
2406 
2407         return 0;
2408 }
2409 EXPORT_SYMBOL(netdev_txq_to_tc);
2410 
2411 #ifdef CONFIG_XPS
2412 static struct static_key xps_needed __read_mostly;
2413 static struct static_key xps_rxqs_needed __read_mostly;
2414 static DEFINE_MUTEX(xps_map_mutex);
2415 #define xmap_dereference(P)             \
2416         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2417 
2418 static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2419                              struct xps_dev_maps *old_maps, int tci, u16 index)
2420 {
2421         struct xps_map *map = NULL;
2422         int pos;
2423 
2424         map = xmap_dereference(dev_maps->attr_map[tci]);
2425         if (!map)
2426                 return false;
2427 
2428         for (pos = map->len; pos--;) {
2429                 if (map->queues[pos] != index)
2430                         continue;
2431 
2432                 if (map->len > 1) {
2433                         map->queues[pos] = map->queues[--map->len];
2434                         break;
2435                 }
2436 
2437                 if (old_maps)
2438                         RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
2439                 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2440                 kfree_rcu(map, rcu);
2441                 return false;
2442         }
2443 
2444         return true;
2445 }
2446 
2447 static bool remove_xps_queue_cpu(struct net_device *dev,
2448                                  struct xps_dev_maps *dev_maps,
2449                                  int cpu, u16 offset, u16 count)
2450 {
2451         int num_tc = dev_maps->num_tc;
2452         bool active = false;
2453         int tci;
2454 
2455         for (tci = cpu * num_tc; num_tc--; tci++) {
2456                 int i, j;
2457 
2458                 for (i = count, j = offset; i--; j++) {
2459                         if (!remove_xps_queue(dev_maps, NULL, tci, j))
2460                                 break;
2461                 }
2462 
2463                 active |= i < 0;
2464         }
2465 
2466         return active;
2467 }
2468 
2469 static void reset_xps_maps(struct net_device *dev,
2470                            struct xps_dev_maps *dev_maps,
2471                            enum xps_map_type type)
2472 {
2473         static_key_slow_dec_cpuslocked(&xps_needed);
2474         if (type == XPS_RXQS)
2475                 static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2476 
2477         RCU_INIT_POINTER(dev->xps_maps[type], NULL);
2478 
2479         kfree_rcu(dev_maps, rcu);
2480 }
2481 
2482 static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
2483                            u16 offset, u16 count)
2484 {
2485         struct xps_dev_maps *dev_maps;
2486         bool active = false;
2487         int i, j;
2488 
2489         dev_maps = xmap_dereference(dev->xps_maps[type]);
2490         if (!dev_maps)
2491                 return;
2492 
2493         for (j = 0; j < dev_maps->nr_ids; j++)
2494                 active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
2495         if (!active)
2496                 reset_xps_maps(dev, dev_maps, type);
2497 
2498         if (type == XPS_CPUS) {
2499                 for (i = offset + (count - 1); count--; i--)
2500                         netdev_queue_numa_node_write(
2501                                 netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
2502         }
2503 }
2504 
2505 static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2506                                    u16 count)
2507 {
2508         if (!static_key_false(&xps_needed))
2509                 return;
2510 
2511         cpus_read_lock();
2512         mutex_lock(&xps_map_mutex);
2513 
2514         if (static_key_false(&xps_rxqs_needed))
2515                 clean_xps_maps(dev, XPS_RXQS, offset, count);
2516 
2517         clean_xps_maps(dev, XPS_CPUS, offset, count);
2518 
2519         mutex_unlock(&xps_map_mutex);
2520         cpus_read_unlock();
2521 }
2522 
2523 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2524 {
2525         netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2526 }
2527 
2528 static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2529                                       u16 index, bool is_rxqs_map)
2530 {
2531         struct xps_map *new_map;
2532         int alloc_len = XPS_MIN_MAP_ALLOC;
2533         int i, pos;
2534 
2535         for (pos = 0; map && pos < map->len; pos++) {
2536                 if (map->queues[pos] != index)
2537                         continue;
2538                 return map;
2539         }
2540 
2541         /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2542         if (map) {
2543                 if (pos < map->alloc_len)
2544                         return map;
2545 
2546                 alloc_len = map->alloc_len * 2;
2547         }
2548 
2549         /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2550          *  map
2551          */
2552         if (is_rxqs_map)
2553                 new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2554         else
2555                 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2556                                        cpu_to_node(attr_index));
2557         if (!new_map)
2558                 return NULL;
2559 
2560         for (i = 0; i < pos; i++)
2561                 new_map->queues[i] = map->queues[i];
2562         new_map->alloc_len = alloc_len;
2563         new_map->len = pos;
2564 
2565         return new_map;
2566 }
2567 
2568 /* Copy xps maps at a given index */
2569 static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
2570                               struct xps_dev_maps *new_dev_maps, int index,
2571                               int tc, bool skip_tc)
2572 {
2573         int i, tci = index * dev_maps->num_tc;
2574         struct xps_map *map;
2575 
2576         /* copy maps belonging to foreign traffic classes */
2577         for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2578                 if (i == tc && skip_tc)
2579                         continue;
2580 
2581                 /* fill in the new device map from the old device map */
2582                 map = xmap_dereference(dev_maps->attr_map[tci]);
2583                 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2584         }
2585 }
2586 
2587 /* Must be called under cpus_read_lock */
2588 int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2589                           u16 index, enum xps_map_type type)
2590 {
2591         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
2592         const unsigned long *online_mask = NULL;
2593         bool active = false, copy = false;
2594         int i, j, tci, numa_node_id = -2;
2595         int maps_sz, num_tc = 1, tc = 0;
2596         struct xps_map *map, *new_map;
2597         unsigned int nr_ids;
2598 
2599         WARN_ON_ONCE(index >= dev->num_tx_queues);
2600 
2601         if (dev->num_tc) {
2602                 /* Do not allow XPS on subordinate device directly */
2603                 num_tc = dev->num_tc;
2604                 if (num_tc < 0)
2605                         return -EINVAL;
2606 
2607                 /* If queue belongs to subordinate dev use its map */
2608                 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2609 
2610                 tc = netdev_txq_to_tc(dev, index);
2611                 if (tc < 0)
2612                         return -EINVAL;
2613         }
2614 
2615         mutex_lock(&xps_map_mutex);
2616 
2617         dev_maps = xmap_dereference(dev->xps_maps[type]);
2618         if (type == XPS_RXQS) {
2619                 maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2620                 nr_ids = dev->num_rx_queues;
2621         } else {
2622                 maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2623                 if (num_possible_cpus() > 1)
2624                         online_mask = cpumask_bits(cpu_online_mask);
2625                 nr_ids = nr_cpu_ids;
2626         }
2627 
2628         if (maps_sz < L1_CACHE_BYTES)
2629                 maps_sz = L1_CACHE_BYTES;
2630 
2631         /* The old dev_maps could be larger or smaller than the one we're
2632          * setting up now, as dev->num_tc or nr_ids could have been updated in
2633          * between. We could try to be smart, but let's be safe instead and only
2634          * copy foreign traffic classes if the two map sizes match.
2635          */
2636         if (dev_maps &&
2637             dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
2638                 copy = true;
2639 
2640         /* allocate memory for queue storage */
2641         for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2642              j < nr_ids;) {
2643                 if (!new_dev_maps) {
2644                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2645                         if (!new_dev_maps) {
2646                                 mutex_unlock(&xps_map_mutex);
2647                                 return -ENOMEM;
2648                         }
2649 
2650                         new_dev_maps->nr_ids = nr_ids;
2651                         new_dev_maps->num_tc = num_tc;
2652                 }
2653 
2654                 tci = j * num_tc + tc;
2655                 map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
2656 
2657                 map = expand_xps_map(map, j, index, type == XPS_RXQS);
2658                 if (!map)
2659                         goto error;
2660 
2661                 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2662         }
2663 
2664         if (!new_dev_maps)
2665                 goto out_no_new_maps;
2666 
2667         if (!dev_maps) {
2668                 /* Increment static keys at most once per type */
2669                 static_key_slow_inc_cpuslocked(&xps_needed);
2670                 if (type == XPS_RXQS)
2671                         static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2672         }
2673 
2674         for (j = 0; j < nr_ids; j++) {
2675                 bool skip_tc = false;
2676 
2677                 tci = j * num_tc + tc;
2678                 if (netif_attr_test_mask(j, mask, nr_ids) &&
2679                     netif_attr_test_online(j, online_mask, nr_ids)) {
2680                         /* add tx-queue to CPU/rx-queue maps */
2681                         int pos = 0;
2682 
2683                         skip_tc = true;
2684 
2685                         map = xmap_dereference(new_dev_maps->attr_map[tci]);
2686                         while ((pos < map->len) && (map->queues[pos] != index))
2687                                 pos++;
2688 
2689                         if (pos == map->len)
2690                                 map->queues[map->len++] = index;
2691 #ifdef CONFIG_NUMA
2692                         if (type == XPS_CPUS) {
2693                                 if (numa_node_id == -2)
2694                                         numa_node_id = cpu_to_node(j);
2695                                 else if (numa_node_id != cpu_to_node(j))
2696                                         numa_node_id = -1;
2697                         }
2698 #endif
2699                 }
2700 
2701                 if (copy)
2702                         xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
2703                                           skip_tc);
2704         }
2705 
2706         rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
2707 
2708         /* Cleanup old maps */
2709         if (!dev_maps)
2710                 goto out_no_old_maps;
2711 
2712         for (j = 0; j < dev_maps->nr_ids; j++) {
2713                 for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
2714                         map = xmap_dereference(dev_maps->attr_map[tci]);
2715                         if (!map)
2716                                 continue;
2717 
2718                         if (copy) {
2719                                 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2720                                 if (map == new_map)
2721                                         continue;
2722                         }
2723 
2724                         RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2725                         kfree_rcu(map, rcu);
2726                 }
2727         }
2728 
2729         old_dev_maps = dev_maps;
2730 
2731 out_no_old_maps:
2732         dev_maps = new_dev_maps;
2733         active = true;
2734 
2735 out_no_new_maps:
2736         if (type == XPS_CPUS)
2737                 /* update Tx queue numa node */
2738                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2739                                              (numa_node_id >= 0) ?
2740                                              numa_node_id : NUMA_NO_NODE);
2741 
2742         if (!dev_maps)
2743                 goto out_no_maps;
2744 
2745         /* removes tx-queue from unused CPUs/rx-queues */
2746         for (j = 0; j < dev_maps->nr_ids; j++) {
2747                 tci = j * dev_maps->num_tc;
2748 
2749                 for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2750                         if (i == tc &&
2751                             netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
2752                             netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
2753                                 continue;
2754 
2755                         active |= remove_xps_queue(dev_maps,
2756                                                    copy ? old_dev_maps : NULL,
2757                                                    tci, index);
2758                 }
2759         }
2760 
2761         if (old_dev_maps)
2762                 kfree_rcu(old_dev_maps, rcu);
2763 
2764         /* free map if not active */
2765         if (!active)
2766                 reset_xps_maps(dev, dev_maps, type);
2767 
2768 out_no_maps:
2769         mutex_unlock(&xps_map_mutex);
2770 
2771         return 0;
2772 error:
2773         /* remove any maps that we added */
2774         for (j = 0; j < nr_ids; j++) {
2775                 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2776                         new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2777                         map = copy ?
2778                               xmap_dereference(dev_maps->attr_map[tci]) :
2779                               NULL;
2780                         if (new_map && new_map != map)
2781                                 kfree(new_map);
2782                 }
2783         }
2784 
2785         mutex_unlock(&xps_map_mutex);
2786 
2787         kfree(new_dev_maps);
2788         return -ENOMEM;
2789 }
2790 EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2791 
2792 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2793                         u16 index)
2794 {
2795         int ret;
2796 
2797         cpus_read_lock();
2798         ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
2799         cpus_read_unlock();
2800 
2801         return ret;
2802 }
2803 EXPORT_SYMBOL(netif_set_xps_queue);
2804 
2805 #endif
2806 static void netdev_unbind_all_sb_channels(struct net_device *dev)
2807 {
2808         struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2809 
2810         /* Unbind any subordinate channels */
2811         while (txq-- != &dev->_tx[0]) {
2812                 if (txq->sb_dev)
2813                         netdev_unbind_sb_channel(dev, txq->sb_dev);
2814         }
2815 }
2816 
2817 void netdev_reset_tc(struct net_device *dev)
2818 {
2819 #ifdef CONFIG_XPS
2820         netif_reset_xps_queues_gt(dev, 0);
2821 #endif
2822         netdev_unbind_all_sb_channels(dev);
2823 
2824         /* Reset TC configuration of device */
2825         dev->num_tc = 0;
2826         memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2827         memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2828 }
2829 EXPORT_SYMBOL(netdev_reset_tc);
2830 
2831 int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2832 {
2833         if (tc >= dev->num_tc)
2834                 return -EINVAL;
2835 
2836 #ifdef CONFIG_XPS
2837         netif_reset_xps_queues(dev, offset, count);
2838 #endif
2839         dev->tc_to_txq[tc].count = count;
2840         dev->tc_to_txq[tc].offset = offset;
2841         return 0;
2842 }
2843 EXPORT_SYMBOL(netdev_set_tc_queue);
2844 
2845 int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2846 {
2847         if (num_tc > TC_MAX_QUEUE)
2848                 return -EINVAL;
2849 
2850 #ifdef CONFIG_XPS
2851         netif_reset_xps_queues_gt(dev, 0);
2852 #endif
2853         netdev_unbind_all_sb_channels(dev);
2854 
2855         dev->num_tc = num_tc;
2856         return 0;
2857 }
2858 EXPORT_SYMBOL(netdev_set_num_tc);
2859 
2860 void netdev_unbind_sb_channel(struct net_device *dev,
2861                               struct net_device *sb_dev)
2862 {
2863         struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2864 
2865 #ifdef CONFIG_XPS
2866         netif_reset_xps_queues_gt(sb_dev, 0);
2867 #endif
2868         memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2869         memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2870 
2871         while (txq-- != &dev->_tx[0]) {
2872                 if (txq->sb_dev == sb_dev)
2873                         txq->sb_dev = NULL;
2874         }
2875 }
2876 EXPORT_SYMBOL(netdev_unbind_sb_channel);
2877 
2878 int netdev_bind_sb_channel_queue(struct net_device *dev,
2879                                  struct net_device *sb_dev,
2880                                  u8 tc, u16 count, u16 offset)
2881 {
2882         /* Make certain the sb_dev and dev are already configured */
2883         if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2884                 return -EINVAL;
2885 
2886         /* We cannot hand out queues we don't have */
2887         if ((offset + count) > dev->real_num_tx_queues)
2888                 return -EINVAL;
2889 
2890         /* Record the mapping */
2891         sb_dev->tc_to_txq[tc].count = count;
2892         sb_dev->tc_to_txq[tc].offset = offset;
2893 
2894         /* Provide a way for Tx queue to find the tc_to_txq map or
2895          * XPS map for itself.
2896          */
2897         while (count--)
2898                 netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2899 
2900         return 0;
2901 }
2902 EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2903 
2904 int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2905 {
2906         /* Do not use a multiqueue device to represent a subordinate channel */
2907         if (netif_is_multiqueue(dev))
2908                 return -ENODEV;
2909 
2910         /* We allow channels 1 - 32767 to be used for subordinate channels.
2911          * Channel 0 is meant to be "native" mode and used only to represent
2912          * the main root device. We allow writing 0 to reset the device back
2913          * to normal mode after being used as a subordinate channel.
2914          */
2915         if (channel > S16_MAX)
2916                 return -EINVAL;
2917 
2918         dev->num_tc = -channel;
2919 
2920         return 0;
2921 }
2922 EXPORT_SYMBOL(netdev_set_sb_channel);
2923 
2924 /*
2925  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2926  * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
2927  */
2928 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2929 {
2930         bool disabling;
2931         int rc;
2932 
2933         disabling = txq < dev->real_num_tx_queues;
2934 
2935         if (txq < 1 || txq > dev->num_tx_queues)
2936                 return -EINVAL;
2937 
2938         if (dev->reg_state == NETREG_REGISTERED ||
2939             dev->reg_state == NETREG_UNREGISTERING) {
2940                 ASSERT_RTNL();
2941 
2942                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2943                                                   txq);
2944                 if (rc)
2945                         return rc;
2946 
2947                 if (dev->num_tc)
2948                         netif_setup_tc(dev, txq);
2949 
2950                 dev_qdisc_change_real_num_tx(dev, txq);
2951 
2952                 dev->real_num_tx_queues = txq;
2953 
2954                 if (disabling) {
2955                         synchronize_net();
2956                         qdisc_reset_all_tx_gt(dev, txq);
2957 #ifdef CONFIG_XPS
2958                         netif_reset_xps_queues_gt(dev, txq);
2959 #endif
2960                 }
2961         } else {
2962                 dev->real_num_tx_queues = txq;
2963         }
2964 
2965         return 0;
2966 }
2967 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2968 
2969 #ifdef CONFIG_SYSFS
2970 /**
2971  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2972  *      @dev: Network device
2973  *      @rxq: Actual number of RX queues
2974  *
2975  *      This must be called either with the rtnl_lock held or before
2976  *      registration of the net device.  Returns 0 on success, or a
2977  *      negative error code.  If called before registration, it always
2978  *      succeeds.
2979  */
2980 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2981 {
2982         int rc;
2983 
2984         if (rxq < 1 || rxq > dev->num_rx_queues)
2985                 return -EINVAL;
2986 
2987         if (dev->reg_state == NETREG_REGISTERED) {
2988                 ASSERT_RTNL();
2989 
2990                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2991                                                   rxq);
2992                 if (rc)
2993                         return rc;
2994         }
2995 
2996         dev->real_num_rx_queues = rxq;
2997         return 0;
2998 }
2999 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
3000 #endif
3001 
3002 /**
3003  *      netif_set_real_num_queues - set actual number of RX and TX queues used
3004  *      @dev: Network device
3005  *      @txq: Actual number of TX queues
3006  *      @rxq: Actual number of RX queues
3007  *
3008  *      Set the real number of both TX and RX queues.
3009  *      Does nothing if the number of queues is already correct.
3010  */
3011 int netif_set_real_num_queues(struct net_device *dev,
3012                               unsigned int txq, unsigned int rxq)
3013 {
3014         unsigned int old_rxq = dev->real_num_rx_queues;
3015         int err;
3016 
3017         if (txq < 1 || txq > dev->num_tx_queues ||
3018             rxq < 1 || rxq > dev->num_rx_queues)
3019                 return -EINVAL;
3020 
3021         /* Start from increases, so the error path only does decreases -
3022          * decreases can't fail.
3023          */
3024         if (rxq > dev->real_num_rx_queues) {
3025                 err = netif_set_real_num_rx_queues(dev, rxq);
3026                 if (err)
3027                         return err;
3028         }
3029         if (txq > dev->real_num_tx_queues) {
3030                 err = netif_set_real_num_tx_queues(dev, txq);
3031                 if (err)
3032                         goto undo_rx;
3033         }
3034         if (rxq < dev->real_num_rx_queues)
3035                 WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
3036         if (txq < dev->real_num_tx_queues)
3037                 WARN_ON(netif_set_real_num_tx_queues(dev, txq));
3038 
3039         return 0;
3040 undo_rx:
3041         WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
3042         return err;
3043 }
3044 EXPORT_SYMBOL(netif_set_real_num_queues);
3045 
3046 /**
3047  * netif_set_tso_max_size() - set the max size of TSO frames supported
3048  * @dev:        netdev to update
3049  * @size:       max skb->len of a TSO frame
3050  *
3051  * Set the limit on the size of TSO super-frames the device can handle.
3052  * Unless explicitly set the stack will assume the value of
3053  * %GSO_LEGACY_MAX_SIZE.
3054  */
3055 void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
3056 {
3057         dev->tso_max_size = min(GSO_MAX_SIZE, size);
3058         if (size < READ_ONCE(dev->gso_max_size))
3059                 netif_set_gso_max_size(dev, size);
3060         if (size < READ_ONCE(dev->gso_ipv4_max_size))
3061                 netif_set_gso_ipv4_max_size(dev, size);
3062 }
3063 EXPORT_SYMBOL(netif_set_tso_max_size);
3064 
3065 /**
3066  * netif_set_tso_max_segs() - set the max number of segs supported for TSO
3067  * @dev:        netdev to update
3068  * @segs:       max number of TCP segments
3069  *
3070  * Set the limit on the number of TCP segments the device can generate from
3071  * a single TSO super-frame.
3072  * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
3073  */
3074 void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
3075 {
3076         dev->tso_max_segs = segs;
3077         if (segs < READ_ONCE(dev->gso_max_segs))
3078                 netif_set_gso_max_segs(dev, segs);
3079 }
3080 EXPORT_SYMBOL(netif_set_tso_max_segs);
3081 
3082 /**
3083  * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
3084  * @to:         netdev to update
3085  * @from:       netdev from which to copy the limits
3086  */
3087 void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
3088 {
3089         netif_set_tso_max_size(to, from->tso_max_size);
3090         netif_set_tso_max_segs(to, from->tso_max_segs);
3091 }
3092 EXPORT_SYMBOL(netif_inherit_tso_max);
3093 
3094 /**
3095  * netif_get_num_default_rss_queues - default number of RSS queues
3096  *
3097  * Default value is the number of physical cores if there are only 1 or 2, or
3098  * divided by 2 if there are more.
3099  */
3100 int netif_get_num_default_rss_queues(void)
3101 {
3102         cpumask_var_t cpus;
3103         int cpu, count = 0;
3104 
3105         if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
3106                 return 1;
3107 
3108         cpumask_copy(cpus, cpu_online_mask);
3109         for_each_cpu(cpu, cpus) {
3110                 ++count;
3111                 cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
3112         }
3113         free_cpumask_var(cpus);
3114 
3115         return count > 2 ? DIV_ROUND_UP(count, 2) : count;
3116 }
3117 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
3118 
3119 static void __netif_reschedule(struct Qdisc *q)
3120 {
3121         struct softnet_data *sd;
3122         unsigned long flags;
3123 
3124         local_irq_save(flags);
3125         sd = this_cpu_ptr(&softnet_data);
3126         q->next_sched = NULL;
3127         *sd->output_queue_tailp = q;
3128         sd->output_queue_tailp = &q->next_sched;
3129         raise_softirq_irqoff(NET_TX_SOFTIRQ);
3130         local_irq_restore(flags);
3131 }
3132 
3133 void __netif_schedule(struct Qdisc *q)
3134 {
3135         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3136                 __netif_reschedule(q);
3137 }
3138 EXPORT_SYMBOL(__netif_schedule);
3139 
3140 struct dev_kfree_skb_cb {
3141         enum skb_drop_reason reason;
3142 };
3143 
3144 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3145 {
3146         return (struct dev_kfree_skb_cb *)skb->cb;
3147 }
3148 
3149 void netif_schedule_queue(struct netdev_queue *txq)
3150 {
3151         rcu_read_lock();
3152         if (!netif_xmit_stopped(txq)) {
3153                 struct Qdisc *q = rcu_dereference(txq->qdisc);
3154 
3155                 __netif_schedule(q);
3156         }
3157         rcu_read_unlock();
3158 }
3159 EXPORT_SYMBOL(netif_schedule_queue);
3160 
3161 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3162 {
3163         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3164                 struct Qdisc *q;
3165 
3166                 rcu_read_lock();
3167                 q = rcu_dereference(dev_queue->qdisc);
3168                 __netif_schedule(q);
3169                 rcu_read_unlock();
3170         }
3171 }
3172 EXPORT_SYMBOL(netif_tx_wake_queue);
3173 
3174 void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3175 {
3176         unsigned long flags;
3177 
3178         if (unlikely(!skb))
3179                 return;
3180 
3181         if (likely(refcount_read(&skb->users) == 1)) {
3182                 smp_rmb();
3183                 refcount_set(&skb->users, 0);
3184         } else if (likely(!refcount_dec_and_test(&skb->users))) {
3185                 return;
3186         }
3187         get_kfree_skb_cb(skb)->reason = reason;
3188         local_irq_save(flags);
3189         skb->next = __this_cpu_read(softnet_data.completion_queue);
3190         __this_cpu_write(softnet_data.completion_queue, skb);
3191         raise_softirq_irqoff(NET_TX_SOFTIRQ);
3192         local_irq_restore(flags);
3193 }
3194 EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
3195 
3196 void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3197 {
3198         if (in_hardirq() || irqs_disabled())
3199                 dev_kfree_skb_irq_reason(skb, reason);
3200         else
3201                 kfree_skb_reason(skb, reason);
3202 }
3203 EXPORT_SYMBOL(dev_kfree_skb_any_reason);
3204 
3205 
3206 /**
3207  * netif_device_detach - mark device as removed
3208  * @dev: network device
3209  *
3210  * Mark device as removed from system and therefore no longer available.
3211  */
3212 void netif_device_detach(struct net_device *dev)
3213 {
3214         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3215             netif_running(dev)) {
3216                 netif_tx_stop_all_queues(dev);
3217         }
3218 }
3219 EXPORT_SYMBOL(netif_device_detach);
3220 
3221 /**
3222  * netif_device_attach - mark device as attached
3223  * @dev: network device
3224  *
3225  * Mark device as attached from system and restart if needed.
3226  */
3227 void netif_device_attach(struct net_device *dev)
3228 {
3229         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3230             netif_running(dev)) {
3231                 netif_tx_wake_all_queues(dev);
3232                 __netdev_watchdog_up(dev);
3233         }
3234 }
3235 EXPORT_SYMBOL(netif_device_attach);
3236 
3237 /*
3238  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
3239  * to be used as a distribution range.
3240  */
3241 static u16 skb_tx_hash(const struct net_device *dev,
3242                        const struct net_device *sb_dev,
3243                        struct sk_buff *skb)
3244 {
3245         u32 hash;
3246         u16 qoffset = 0;
3247         u16 qcount = dev->real_num_tx_queues;
3248 
3249         if (dev->num_tc) {
3250                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3251 
3252                 qoffset = sb_dev->tc_to_txq[tc].offset;
3253                 qcount = sb_dev->tc_to_txq[tc].count;
3254                 if (unlikely(!qcount)) {
3255                         net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
3256                                              sb_dev->name, qoffset, tc);
3257                         qoffset = 0;
3258                         qcount = dev->real_num_tx_queues;
3259                 }
3260         }
3261 
3262         if (skb_rx_queue_recorded(skb)) {
3263                 DEBUG_NET_WARN_ON_ONCE(qcount == 0);
3264                 hash = skb_get_rx_queue(skb);
3265                 if (hash >= qoffset)
3266                         hash -= qoffset;
3267                 while (unlikely(hash >= qcount))
3268                         hash -= qcount;
3269                 return hash + qoffset;
3270         }
3271 
3272         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3273 }
3274 
3275 void skb_warn_bad_offload(const struct sk_buff *skb)
3276 {
3277         static const netdev_features_t null_features;
3278         struct net_device *dev = skb->dev;
3279         const char *name = "";
3280 
3281         if (!net_ratelimit())
3282                 return;
3283 
3284         if (dev) {
3285                 if (dev->dev.parent)
3286                         name = dev_driver_string(dev->dev.parent);
3287                 else
3288                         name = netdev_name(dev);
3289         }
3290         skb_dump(KERN_WARNING, skb, false);
3291         WARN(1, "%s: caps=(%pNF, %pNF)\n",
3292              name, dev ? &dev->features : &null_features,
3293              skb->sk ? &skb->sk->sk_route_caps : &null_features);
3294 }
3295 
3296 /*
3297  * Invalidate hardware checksum when packet is to be mangled, and
3298  * complete checksum manually on outgoing path.
3299  */
3300 int skb_checksum_help(struct sk_buff *skb)
3301 {
3302         __wsum csum;
3303         int ret = 0, offset;
3304 
3305         if (skb->ip_summed == CHECKSUM_COMPLETE)
3306                 goto out_set_summed;
3307 
3308         if (unlikely(skb_is_gso(skb))) {
3309                 skb_warn_bad_offload(skb);
3310                 return -EINVAL;
3311         }
3312 
3313         /* Before computing a checksum, we should make sure no frag could
3314          * be modified by an external entity : checksum could be wrong.
3315          */
3316         if (skb_has_shared_frag(skb)) {
3317                 ret = __skb_linearize(skb);
3318                 if (ret)
3319                         goto out;
3320         }
3321 
3322         offset = skb_checksum_start_offset(skb);
3323         ret = -EINVAL;
3324         if (unlikely(offset >= skb_headlen(skb))) {
3325                 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3326                 WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
3327                           offset, skb_headlen(skb));
3328                 goto out;
3329         }
3330         csum = skb_checksum(skb, offset, skb->len - offset, 0);
3331 
3332         offset += skb->csum_offset;
3333         if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
3334                 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3335                 WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
3336                           offset + sizeof(__sum16), skb_headlen(skb));
3337                 goto out;
3338         }
3339         ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3340         if (ret)
3341                 goto out;
3342 
3343         *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3344 out_set_summed:
3345         skb->ip_summed = CHECKSUM_NONE;
3346 out:
3347         return ret;
3348 }
3349 EXPORT_SYMBOL(skb_checksum_help);
3350 
3351 int skb_crc32c_csum_help(struct sk_buff *skb)
3352 {
3353         __le32 crc32c_csum;
3354         int ret = 0, offset, start;
3355 
3356         if (skb->ip_summed != CHECKSUM_PARTIAL)
3357                 goto out;
3358 
3359         if (unlikely(skb_is_gso(skb)))
3360                 goto out;
3361 
3362         /* Before computing a checksum, we should make sure no frag could
3363          * be modified by an external entity : checksum could be wrong.
3364          */
3365         if (unlikely(skb_has_shared_frag(skb))) {
3366                 ret = __skb_linearize(skb);
3367                 if (ret)
3368                         goto out;
3369         }
3370         start = skb_checksum_start_offset(skb);
3371         offset = start + offsetof(struct sctphdr, checksum);
3372         if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3373                 ret = -EINVAL;
3374                 goto out;
3375         }
3376 
3377         ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3378         if (ret)
3379                 goto out;
3380 
3381         crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3382                                                   skb->len - start, ~(__u32)0,
3383                                                   crc32c_csum_stub));
3384         *(__le32 *)(skb->data + offset) = crc32c_csum;
3385         skb_reset_csum_not_inet(skb);
3386 out:
3387         return ret;
3388 }
3389 
3390 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3391 {
3392         __be16 type = skb->protocol;
3393 
3394         /* Tunnel gso handlers can set protocol to ethernet. */
3395         if (type == htons(ETH_P_TEB)) {
3396                 struct ethhdr *eth;
3397 
3398                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3399                         return 0;
3400 
3401                 eth = (struct ethhdr *)skb->data;
3402                 type = eth->h_proto;
3403         }
3404 
3405         return vlan_get_protocol_and_depth(skb, type, depth);
3406 }
3407 
3408 
3409 /* Take action when hardware reception checksum errors are detected. */
3410 #ifdef CONFIG_BUG
3411 static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3412 {
3413         netdev_err(dev, "hw csum failure\n");
3414         skb_dump(KERN_ERR, skb, true);
3415         dump_stack();
3416 }
3417 
3418 void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3419 {
3420         DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
3421 }
3422 EXPORT_SYMBOL(netdev_rx_csum_fault);
3423 #endif
3424 
3425 /* XXX: check that highmem exists at all on the given machine. */
3426 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3427 {
3428 #ifdef CONFIG_HIGHMEM
3429         int i;
3430 
3431         if (!(dev->features & NETIF_F_HIGHDMA)) {
3432                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3433                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3434 
3435                         if (PageHighMem(skb_frag_page(frag)))
3436                                 return 1;
3437                 }
3438         }
3439 #endif
3440         return 0;
3441 }
3442 
3443 /* If MPLS offload request, verify we are testing hardware MPLS features
3444  * instead of standard features for the netdev.
3445  */
3446 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3447 static netdev_features_t net_mpls_features(struct sk_buff *skb,
3448                                            netdev_features_t features,
3449                                            __be16 type)
3450 {
3451         if (eth_p_mpls(type))
3452                 features &= skb->dev->mpls_features;
3453 
3454         return features;
3455 }
3456 #else
3457 static netdev_features_t net_mpls_features(struct sk_buff *skb,
3458                                            netdev_features_t features,
3459                                            __be16 type)
3460 {
3461         return features;
3462 }
3463 #endif
3464 
3465 static netdev_features_t harmonize_features(struct sk_buff *skb,
3466         netdev_features_t features)
3467 {
3468         __be16 type;
3469 
3470         type = skb_network_protocol(skb, NULL);
3471         features = net_mpls_features(skb, features, type);
3472 
3473         if (skb->ip_summed != CHECKSUM_NONE &&
3474             !can_checksum_protocol(features, type)) {
3475                 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3476         }
3477         if (illegal_highdma(skb->dev, skb))
3478                 features &= ~NETIF_F_SG;
3479 
3480         return features;
3481 }
3482 
3483 netdev_features_t passthru_features_check(struct sk_buff *skb,
3484                                           struct net_device *dev,
3485                                           netdev_features_t features)
3486 {
3487         return features;
3488 }
3489 EXPORT_SYMBOL(passthru_features_check);
3490 
3491 static netdev_features_t dflt_features_check(struct sk_buff *skb,
3492                                              struct net_device *dev,
3493                                              netdev_features_t features)
3494 {
3495         return vlan_features_check(skb, features);
3496 }
3497 
3498 static netdev_features_t gso_features_check(const struct sk_buff *skb,
3499                                             struct net_device *dev,
3500                                             netdev_features_t features)
3501 {
3502         u16 gso_segs = skb_shinfo(skb)->gso_segs;
3503 
3504         if (gso_segs > READ_ONCE(dev->gso_max_segs))
3505                 return features & ~NETIF_F_GSO_MASK;
3506 
3507         if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb)))
3508                 return features & ~NETIF_F_GSO_MASK;
3509 
3510         if (!skb_shinfo(skb)->gso_type) {
3511                 skb_warn_bad_offload(skb);
3512                 return features & ~NETIF_F_GSO_MASK;
3513         }
3514 
3515         /* Support for GSO partial features requires software
3516          * intervention before we can actually process the packets
3517          * so we need to strip support for any partial features now
3518          * and we can pull them back in after we have partially
3519          * segmented the frame.
3520          */
3521         if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3522                 features &= ~dev->gso_partial_features;
3523 
3524         /* Make sure to clear the IPv4 ID mangling feature if the
3525          * IPv4 header has the potential to be fragmented.
3526          */
3527         if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3528                 struct iphdr *iph = skb->encapsulation ?
3529                                     inner_ip_hdr(skb) : ip_hdr(skb);
3530 
3531                 if (!(iph->frag_off & htons(IP_DF)))
3532                         features &= ~NETIF_F_TSO_MANGLEID;
3533         }
3534 
3535         return features;
3536 }
3537 
3538 netdev_features_t netif_skb_features(struct sk_buff *skb)
3539 {
3540         struct net_device *dev = skb->dev;
3541         netdev_features_t features = dev->features;
3542 
3543         if (skb_is_gso(skb))
3544                 features = gso_features_check(skb, dev, features);
3545 
3546         /* If encapsulation offload request, verify we are testing
3547          * hardware encapsulation features instead of standard
3548          * features for the netdev
3549          */
3550         if (skb->encapsulation)
3551                 features &= dev->hw_enc_features;
3552 
3553         if (skb_vlan_tagged(skb))
3554                 features = netdev_intersect_features(features,
3555                                                      dev->vlan_features |
3556                                                      NETIF_F_HW_VLAN_CTAG_TX |
3557                                                      NETIF_F_HW_VLAN_STAG_TX);
3558 
3559         if (dev->netdev_ops->ndo_features_check)
3560                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
3561                                                                 features);
3562         else
3563                 features &= dflt_features_check(skb, dev, features);
3564 
3565         return harmonize_features(skb, features);
3566 }
3567 EXPORT_SYMBOL(netif_skb_features);
3568 
3569 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3570                     struct netdev_queue *txq, bool more)
3571 {
3572         unsigned int len;
3573         int rc;
3574 
3575         if (dev_nit_active(dev))
3576                 dev_queue_xmit_nit(skb, dev);
3577 
3578         len = skb->len;
3579         trace_net_dev_start_xmit(skb, dev);
3580         rc = netdev_start_xmit(skb, dev, txq, more);
3581         trace_net_dev_xmit(skb, rc, dev, len);
3582 
3583         return rc;
3584 }
3585 
3586 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3587                                     struct netdev_queue *txq, int *ret)
3588 {
3589         struct sk_buff *skb = first;
3590         int rc = NETDEV_TX_OK;
3591 
3592         while (skb) {
3593                 struct sk_buff *next = skb->next;
3594 
3595                 skb_mark_not_on_list(skb);
3596                 rc = xmit_one(skb, dev, txq, next != NULL);
3597                 if (unlikely(!dev_xmit_complete(rc))) {
3598                         skb->next = next;
3599                         goto out;
3600                 }
3601 
3602                 skb = next;
3603                 if (netif_tx_queue_stopped(txq) && skb) {
3604                         rc = NETDEV_TX_BUSY;
3605                         break;
3606                 }
3607         }
3608 
3609 out:
3610         *ret = rc;
3611         return skb;
3612 }
3613 
3614 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3615                                           netdev_features_t features)
3616 {
3617         if (skb_vlan_tag_present(skb) &&
3618             !vlan_hw_offload_capable(features, skb->vlan_proto))
3619                 skb = __vlan_hwaccel_push_inside(skb);
3620         return skb;
3621 }
3622 
3623 int skb_csum_hwoffload_help(struct sk_buff *skb,
3624                             const netdev_features_t features)
3625 {
3626         if (unlikely(skb_csum_is_sctp(skb)))
3627                 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3628                         skb_crc32c_csum_help(skb);
3629 
3630         if (features & NETIF_F_HW_CSUM)
3631                 return 0;
3632 
3633         if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
3634                 switch (skb->csum_offset) {
3635                 case offsetof(struct tcphdr, check):
3636                 case offsetof(struct udphdr, check):
3637                         return 0;
3638                 }
3639         }
3640 
3641         return skb_checksum_help(skb);
3642 }
3643 EXPORT_SYMBOL(skb_csum_hwoffload_help);
3644 
3645 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3646 {
3647         netdev_features_t features;
3648 
3649         features = netif_skb_features(skb);
3650         skb = validate_xmit_vlan(skb, features);
3651         if (unlikely(!skb))
3652                 goto out_null;
3653 
3654         skb = sk_validate_xmit_skb(skb, dev);
3655         if (unlikely(!skb))
3656                 goto out_null;
3657 
3658         if (netif_needs_gso(skb, features)) {
3659                 struct sk_buff *segs;
3660 
3661                 segs = skb_gso_segment(skb, features);
3662                 if (IS_ERR(segs)) {
3663                         goto out_kfree_skb;
3664                 } else if (segs) {
3665                         consume_skb(skb);
3666                         skb = segs;
3667                 }
3668         } else {
3669                 if (skb_needs_linearize(skb, features) &&
3670                     __skb_linearize(skb))
3671                         goto out_kfree_skb;
3672 
3673                 /* If packet is not checksummed and device does not
3674                  * support checksumming for this protocol, complete
3675                  * checksumming here.
3676                  */
3677                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3678                         if (skb->encapsulation)
3679                                 skb_set_inner_transport_header(skb,
3680                                                                skb_checksum_start_offset(skb));
3681                         else
3682                                 skb_set_transport_header(skb,
3683                                                          skb_checksum_start_offset(skb));
3684                         if (skb_csum_hwoffload_help(skb, features))
3685                                 goto out_kfree_skb;
3686                 }
3687         }
3688 
3689         skb = validate_xmit_xfrm(skb, features, again);
3690 
3691         return skb;
3692 
3693 out_kfree_skb:
3694         kfree_skb(skb);
3695 out_null:
3696         dev_core_stats_tx_dropped_inc(dev);
3697         return NULL;
3698 }
3699 
3700 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3701 {
3702         struct sk_buff *next, *head = NULL, *tail;
3703 
3704         for (; skb != NULL; skb = next) {
3705                 next = skb->next;
3706                 skb_mark_not_on_list(skb);
3707 
3708                 /* in case skb wont be segmented, point to itself */
3709                 skb->prev = skb;
3710 
3711                 skb = validate_xmit_skb(skb, dev, again);
3712                 if (!skb)
3713                         continue;
3714 
3715                 if (!head)
3716                         head = skb;
3717                 else
3718                         tail->next = skb;
3719                 /* If skb was segmented, skb->prev points to
3720                  * the last segment. If not, it still contains skb.
3721                  */
3722                 tail = skb->prev;
3723         }
3724         return head;
3725 }
3726 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3727 
3728 static void qdisc_pkt_len_init(struct sk_buff *skb)
3729 {
3730         const struct skb_shared_info *shinfo = skb_shinfo(skb);
3731 
3732         qdisc_skb_cb(skb)->pkt_len = skb->len;
3733 
3734         /* To get more precise estimation of bytes sent on wire,
3735          * we add to pkt_len the headers size of all segments
3736          */
3737         if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3738                 u16 gso_segs = shinfo->gso_segs;
3739                 unsigned int hdr_len;
3740 
3741                 /* mac layer + network layer */
3742                 hdr_len = skb_transport_offset(skb);
3743 
3744                 /* + transport layer */
3745                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3746                         const struct tcphdr *th;
3747                         struct tcphdr _tcphdr;
3748 
3749                         th = skb_header_pointer(skb, hdr_len,
3750                                                 sizeof(_tcphdr), &_tcphdr);
3751                         if (likely(th))
3752                                 hdr_len += __tcp_hdrlen(th);
3753                 } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
3754                         struct udphdr _udphdr;
3755 
3756                         if (skb_header_pointer(skb, hdr_len,
3757                                                sizeof(_udphdr), &_udphdr))
3758                                 hdr_len += sizeof(struct udphdr);
3759                 }
3760 
3761                 if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) {
3762                         int payload = skb->len - hdr_len;
3763 
3764                         /* Malicious packet. */
3765                         if (payload <= 0)
3766                                 return;
3767                         gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
3768                 }
3769                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3770         }
3771 }
3772 
3773 static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
3774                              struct sk_buff **to_free,
3775                              struct netdev_queue *txq)
3776 {
3777         int rc;
3778 
3779         rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
3780         if (rc == NET_XMIT_SUCCESS)
3781                 trace_qdisc_enqueue(q, txq, skb);
3782         return rc;
3783 }
3784 
3785 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3786                                  struct net_device *dev,
3787                                  struct netdev_queue *txq)
3788 {
3789         spinlock_t *root_lock = qdisc_lock(q);
3790         struct sk_buff *to_free = NULL;
3791         bool contended;
3792         int rc;
3793 
3794         qdisc_calculate_pkt_len(skb, q);
3795 
3796         tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);
3797 
3798         if (q->flags & TCQ_F_NOLOCK) {
3799                 if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
3800                     qdisc_run_begin(q)) {
3801                         /* Retest nolock_qdisc_is_empty() within the protection
3802                          * of q->seqlock to protect from racing with requeuing.
3803                          */
3804                         if (unlikely(!nolock_qdisc_is_empty(q))) {
3805                                 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3806                                 __qdisc_run(q);
3807                                 qdisc_run_end(q);
3808 
3809                                 goto no_lock_out;
3810                         }
3811 
3812                         qdisc_bstats_cpu_update(q, skb);
3813                         if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
3814                             !nolock_qdisc_is_empty(q))
3815                                 __qdisc_run(q);
3816 
3817                         qdisc_run_end(q);
3818                         return NET_XMIT_SUCCESS;
3819                 }
3820 
3821                 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3822                 qdisc_run(q);
3823 
3824 no_lock_out:
3825                 if (unlikely(to_free))
3826                         kfree_skb_list_reason(to_free,
3827                                               tcf_get_drop_reason(to_free));
3828                 return rc;
3829         }
3830 
3831         if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
3832                 kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
3833                 return NET_XMIT_DROP;
3834         }
3835         /*
3836          * Heuristic to force contended enqueues to serialize on a
3837          * separate lock before trying to get qdisc main lock.
3838          * This permits qdisc->running owner to get the lock more
3839          * often and dequeue packets faster.
3840          * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
3841          * and then other tasks will only enqueue packets. The packets will be
3842          * sent after the qdisc owner is scheduled again. To prevent this
3843          * scenario the task always serialize on the lock.
3844          */
3845         contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
3846         if (unlikely(contended))
3847                 spin_lock(&q->busylock);
3848 
3849         spin_lock(root_lock);
3850         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3851                 __qdisc_drop(skb, &to_free);
3852                 rc = NET_XMIT_DROP;
3853         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3854                    qdisc_run_begin(q)) {
3855                 /*
3856                  * This is a work-conserving queue; there are no old skbs
3857                  * waiting to be sent out; and the qdisc is not running -
3858                  * xmit the skb directly.
3859                  */
3860 
3861                 qdisc_bstats_update(q, skb);
3862 
3863                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3864                         if (unlikely(contended)) {
3865                                 spin_unlock(&q->busylock);
3866                                 contended = false;
3867                         }
3868                         __qdisc_run(q);
3869                 }
3870 
3871                 qdisc_run_end(q);
3872                 rc = NET_XMIT_SUCCESS;
3873         } else {
3874                 WRITE_ONCE(q->owner, smp_processor_id());
3875                 rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
3876                 WRITE_ONCE(q->owner, -1);
3877                 if (qdisc_run_begin(q)) {
3878                         if (unlikely(contended)) {
3879                                 spin_unlock(&q->busylock);
3880                                 contended = false;
3881                         }
3882                         __qdisc_run(q);
3883                         qdisc_run_end(q);
3884                 }
3885         }
3886         spin_unlock(root_lock);
3887         if (unlikely(to_free))
3888                 kfree_skb_list_reason(to_free,
3889                                       tcf_get_drop_reason(to_free));
3890         if (unlikely(contended))
3891                 spin_unlock(&q->busylock);
3892         return rc;
3893 }
3894 
3895 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3896 static void skb_update_prio(struct sk_buff *skb)
3897 {
3898         const struct netprio_map *map;
3899         const struct sock *sk;
3900         unsigned int prioidx;
3901 
3902         if (skb->priority)
3903                 return;
3904         map = rcu_dereference_bh(skb->dev->priomap);
3905         if (!map)
3906                 return;
3907         sk = skb_to_full_sk(skb);
3908         if (!sk)
3909                 return;
3910 
3911         prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3912 
3913         if (prioidx < map->priomap_len)
3914                 skb->priority = map->priomap[prioidx];
3915 }
3916 #else
3917 #define skb_update_prio(skb)
3918 #endif
3919 
3920 /**
3921  *      dev_loopback_xmit - loop back @skb
3922  *      @net: network namespace this loopback is happening in
3923  *      @sk:  sk needed to be a netfilter okfn
3924  *      @skb: buffer to transmit
3925  */
3926 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3927 {
3928         skb_reset_mac_header(skb);
3929         __skb_pull(skb, skb_network_offset(skb));
3930         skb->pkt_type = PACKET_LOOPBACK;
3931         if (skb->ip_summed == CHECKSUM_NONE)
3932                 skb->ip_summed = CHECKSUM_UNNECESSARY;
3933         DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
3934         skb_dst_force(skb);
3935         netif_rx(skb);
3936         return 0;
3937 }
3938 EXPORT_SYMBOL(dev_loopback_xmit);
3939 
3940 #ifdef CONFIG_NET_EGRESS
3941 static struct netdev_queue *
3942 netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
3943 {
3944         int qm = skb_get_queue_mapping(skb);
3945 
3946         return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
3947 }
3948 
3949 #ifndef CONFIG_PREEMPT_RT
3950 static bool netdev_xmit_txqueue_skipped(void)
3951 {
3952         return __this_cpu_read(softnet_data.xmit.skip_txqueue);
3953 }
3954 
3955 void netdev_xmit_skip_txqueue(bool skip)
3956 {
3957         __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
3958 }
3959 EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
3960 
3961 #else
3962 static bool netdev_xmit_txqueue_skipped(void)
3963 {
3964         return current->net_xmit.skip_txqueue;
3965 }
3966 
3967 void netdev_xmit_skip_txqueue(bool skip)
3968 {
3969         current->net_xmit.skip_txqueue = skip;
3970 }
3971 EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
3972 #endif
3973 #endif /* CONFIG_NET_EGRESS */
3974 
3975 #ifdef CONFIG_NET_XGRESS
3976 static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
3977                   enum skb_drop_reason *drop_reason)
3978 {
3979         int ret = TC_ACT_UNSPEC;
3980 #ifdef CONFIG_NET_CLS_ACT
3981         struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
3982         struct tcf_result res;
3983 
3984         if (!miniq)
3985                 return ret;
3986 
3987         if (static_branch_unlikely(&tcf_bypass_check_needed_key)) {
3988                 if (tcf_block_bypass_sw(miniq->block))
3989                         return ret;
3990         }
3991 
3992         tc_skb_cb(skb)->mru = 0;
3993         tc_skb_cb(skb)->post_ct = false;
3994         tcf_set_drop_reason(skb, *drop_reason);
3995 
3996         mini_qdisc_bstats_cpu_update(miniq, skb);
3997         ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
3998         /* Only tcf related quirks below. */
3999         switch (ret) {
4000         case TC_ACT_SHOT:
4001                 *drop_reason = tcf_get_drop_reason(skb);
4002                 mini_qdisc_qstats_cpu_drop(miniq);
4003                 break;
4004         case TC_ACT_OK:
4005         case TC_ACT_RECLASSIFY:
4006                 skb->tc_index = TC_H_MIN(res.classid);
4007                 break;
4008         }
4009 #endif /* CONFIG_NET_CLS_ACT */
4010         return ret;
4011 }
4012 
4013 static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
4014 
4015 void tcx_inc(void)
4016 {
4017         static_branch_inc(&tcx_needed_key);
4018 }
4019 
4020 void tcx_dec(void)
4021 {
4022         static_branch_dec(&tcx_needed_key);
4023 }
4024 
4025 static __always_inline enum tcx_action_base
4026 tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
4027         const bool needs_mac)
4028 {
4029         const struct bpf_mprog_fp *fp;
4030         const struct bpf_prog *prog;
4031         int ret = TCX_NEXT;
4032 
4033         if (needs_mac)
4034                 __skb_push(skb, skb->mac_len);
4035         bpf_mprog_foreach_prog(entry, fp, prog) {
4036                 bpf_compute_data_pointers(skb);
4037                 ret = bpf_prog_run(prog, skb);
4038                 if (ret != TCX_NEXT)
4039                         break;
4040         }
4041         if (needs_mac)
4042                 __skb_pull(skb, skb->mac_len);
4043         return tcx_action_code(skb, ret);
4044 }
4045 
4046 static __always_inline struct sk_buff *
4047 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4048                    struct net_device *orig_dev, bool *another)
4049 {
4050         struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
4051         enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
4052         struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
4053         int sch_ret;
4054 
4055         if (!entry)
4056                 return skb;
4057 
4058         bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
4059         if (*pt_prev) {
4060                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4061                 *pt_prev = NULL;
4062         }
4063 
4064         qdisc_skb_cb(skb)->pkt_len = skb->len;
4065         tcx_set_ingress(skb, true);
4066 
4067         if (static_branch_unlikely(&tcx_needed_key)) {
4068                 sch_ret = tcx_run(entry, skb, true);
4069                 if (sch_ret != TC_ACT_UNSPEC)
4070                         goto ingress_verdict;
4071         }
4072         sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
4073 ingress_verdict:
4074         switch (sch_ret) {
4075         case TC_ACT_REDIRECT:
4076                 /* skb_mac_header check was done by BPF, so we can safely
4077                  * push the L2 header back before redirecting to another
4078                  * netdev.
4079                  */
4080                 __skb_push(skb, skb->mac_len);
4081                 if (skb_do_redirect(skb) == -EAGAIN) {
4082                         __skb_pull(skb, skb->mac_len);
4083                         *another = true;
4084                         break;
4085                 }
4086                 *ret = NET_RX_SUCCESS;
4087                 bpf_net_ctx_clear(bpf_net_ctx);
4088                 return NULL;
4089         case TC_ACT_SHOT:
4090                 kfree_skb_reason(skb, drop_reason);
4091                 *ret = NET_RX_DROP;
4092                 bpf_net_ctx_clear(bpf_net_ctx);
4093                 return NULL;
4094         /* used by tc_run */
4095         case TC_ACT_STOLEN:
4096         case TC_ACT_QUEUED:
4097         case TC_ACT_TRAP:
4098                 consume_skb(skb);
4099                 fallthrough;
4100         case TC_ACT_CONSUMED:
4101                 *ret = NET_RX_SUCCESS;
4102                 bpf_net_ctx_clear(bpf_net_ctx);
4103                 return NULL;
4104         }
4105         bpf_net_ctx_clear(bpf_net_ctx);
4106 
4107         return skb;
4108 }
4109 
4110 static __always_inline struct sk_buff *
4111 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4112 {
4113         struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
4114         enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
4115         struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
4116         int sch_ret;
4117 
4118         if (!entry)
4119                 return skb;
4120 
4121         bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
4122 
4123         /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
4124          * already set by the caller.
4125          */
4126         if (static_branch_unlikely(&tcx_needed_key)) {
4127                 sch_ret = tcx_run(entry, skb, false);
4128                 if (sch_ret != TC_ACT_UNSPEC)
4129                         goto egress_verdict;
4130         }
4131         sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
4132 egress_verdict:
4133         switch (sch_ret) {
4134         case TC_ACT_REDIRECT:
4135                 /* No need to push/pop skb's mac_header here on egress! */
4136                 skb_do_redirect(skb);
4137                 *ret = NET_XMIT_SUCCESS;
4138                 bpf_net_ctx_clear(bpf_net_ctx);
4139                 return NULL;
4140         case TC_ACT_SHOT:
4141                 kfree_skb_reason(skb, drop_reason);
4142                 *ret = NET_XMIT_DROP;
4143                 bpf_net_ctx_clear(bpf_net_ctx);
4144                 return NULL;
4145         /* used by tc_run */
4146         case TC_ACT_STOLEN:
4147         case TC_ACT_QUEUED:
4148         case TC_ACT_TRAP:
4149                 consume_skb(skb);
4150                 fallthrough;
4151         case TC_ACT_CONSUMED:
4152                 *ret = NET_XMIT_SUCCESS;
4153                 bpf_net_ctx_clear(bpf_net_ctx);
4154                 return NULL;
4155         }
4156         bpf_net_ctx_clear(bpf_net_ctx);
4157 
4158         return skb;
4159 }
4160 #else
4161 static __always_inline struct sk_buff *
4162 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4163                    struct net_device *orig_dev, bool *another)
4164 {
4165         return skb;
4166 }
4167 
4168 static __always_inline struct sk_buff *
4169 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
4170 {
4171         return skb;
4172 }
4173 #endif /* CONFIG_NET_XGRESS */
4174 
4175 #ifdef CONFIG_XPS
4176 static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
4177                                struct xps_dev_maps *dev_maps, unsigned int tci)
4178 {
4179         int tc = netdev_get_prio_tc_map(dev, skb->priority);
4180         struct xps_map *map;
4181         int queue_index = -1;
4182 
4183         if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
4184                 return queue_index;
4185 
4186         tci *= dev_maps->num_tc;
4187         tci += tc;
4188 
4189         map = rcu_dereference(dev_maps->attr_map[tci]);
4190         if (map) {
4191                 if (map->len == 1)
4192                         queue_index = map->queues[0];
4193                 else
4194                         queue_index = map->queues[reciprocal_scale(
4195                                                 skb_get_hash(skb), map->len)];
4196                 if (unlikely(queue_index >= dev->real_num_tx_queues))
4197                         queue_index = -1;
4198         }
4199         return queue_index;
4200 }
4201 #endif
4202 
4203 static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
4204                          struct sk_buff *skb)
4205 {
4206 #ifdef CONFIG_XPS
4207         struct xps_dev_maps *dev_maps;
4208         struct sock *sk = skb->sk;
4209         int queue_index = -1;
4210 
4211         if (!static_key_false(&xps_needed))
4212                 return -1;
4213 
4214         rcu_read_lock();
4215         if (!static_key_false(&xps_rxqs_needed))
4216                 goto get_cpus_map;
4217 
4218         dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
4219         if (dev_maps) {
4220                 int tci = sk_rx_queue_get(sk);
4221 
4222                 if (tci >= 0)
4223                         queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4224                                                           tci);
4225         }
4226 
4227 get_cpus_map:
4228         if (queue_index < 0) {
4229                 dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
4230                 if (dev_maps) {
4231                         unsigned int tci = skb->sender_cpu - 1;
4232 
4233                         queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4234                                                           tci);
4235                 }
4236         }
4237         rcu_read_unlock();
4238 
4239         return queue_index;
4240 #else
4241         return -1;
4242 #endif
4243 }
4244 
4245 u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
4246                      struct net_device *sb_dev)
4247 {
4248         return 0;
4249 }
4250 EXPORT_SYMBOL(dev_pick_tx_zero);
4251 
4252 u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
4253                        struct net_device *sb_dev)
4254 {
4255         return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
4256 }
4257 EXPORT_SYMBOL(dev_pick_tx_cpu_id);
4258 
4259 u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4260                      struct net_device *sb_dev)
4261 {
4262         struct sock *sk = skb->sk;
4263         int queue_index = sk_tx_queue_get(sk);
4264 
4265         sb_dev = sb_dev ? : dev;
4266 
4267         if (queue_index < 0 || skb->ooo_okay ||
4268             queue_index >= dev->real_num_tx_queues) {
4269                 int new_index = get_xps_queue(dev, sb_dev, skb);
4270 
4271                 if (new_index < 0)
4272                         new_index = skb_tx_hash(dev, sb_dev, skb);
4273 
4274                 if (queue_index != new_index && sk &&
4275                     sk_fullsock(sk) &&
4276                     rcu_access_pointer(sk->sk_dst_cache))
4277                         sk_tx_queue_set(sk, new_index);
4278 
4279                 queue_index = new_index;
4280         }
4281 
4282         return queue_index;
4283 }
4284 EXPORT_SYMBOL(netdev_pick_tx);
4285 
4286 struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4287                                          struct sk_buff *skb,
4288                                          struct net_device *sb_dev)
4289 {
4290         int queue_index = 0;
4291 
4292 #ifdef CONFIG_XPS
4293         u32 sender_cpu = skb->sender_cpu - 1;
4294 
4295         if (sender_cpu >= (u32)NR_CPUS)
4296                 skb->sender_cpu = raw_smp_processor_id() + 1;
4297 #endif
4298 
4299         if (dev->real_num_tx_queues != 1) {
4300                 const struct net_device_ops *ops = dev->netdev_ops;
4301 
4302                 if (ops->ndo_select_queue)
4303                         queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4304                 else
4305                         queue_index = netdev_pick_tx(dev, skb, sb_dev);
4306 
4307                 queue_index = netdev_cap_txqueue(dev, queue_index);
4308         }
4309 
4310         skb_set_queue_mapping(skb, queue_index);
4311         return netdev_get_tx_queue(dev, queue_index);
4312 }
4313 
4314 /**
4315  * __dev_queue_xmit() - transmit a buffer
4316  * @skb:        buffer to transmit
4317  * @sb_dev:     suboordinate device used for L2 forwarding offload
4318  *
4319  * Queue a buffer for transmission to a network device. The caller must
4320  * have set the device and priority and built the buffer before calling
4321  * this function. The function can be called from an interrupt.
4322  *
4323  * When calling this method, interrupts MUST be enabled. This is because
4324  * the BH enable code must have IRQs enabled so that it will not deadlock.
4325  *
4326  * Regardless of the return value, the skb is consumed, so it is currently
4327  * difficult to retry a send to this method. (You can bump the ref count
4328  * before sending to hold a reference for retry if you are careful.)
4329  *
4330  * Return:
4331  * * 0                          - buffer successfully transmitted
4332  * * positive qdisc return code - NET_XMIT_DROP etc.
4333  * * negative errno             - other errors
4334  */
4335 int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4336 {
4337         struct net_device *dev = skb->dev;
4338         struct netdev_queue *txq = NULL;
4339         struct Qdisc *q;
4340         int rc = -ENOMEM;
4341         bool again = false;
4342 
4343         skb_reset_mac_header(skb);
4344         skb_assert_len(skb);
4345 
4346         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
4347                 __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
4348 
4349         /* Disable soft irqs for various locks below. Also
4350          * stops preemption for RCU.
4351          */
4352         rcu_read_lock_bh();
4353 
4354         skb_update_prio(skb);
4355 
4356         qdisc_pkt_len_init(skb);
4357         tcx_set_ingress(skb, false);
4358 #ifdef CONFIG_NET_EGRESS
4359         if (static_branch_unlikely(&egress_needed_key)) {
4360                 if (nf_hook_egress_active()) {
4361                         skb = nf_hook_egress(skb, &rc, dev);
4362                         if (!skb)
4363                                 goto out;
4364                 }
4365 
4366                 netdev_xmit_skip_txqueue(false);
4367 
4368                 nf_skip_egress(skb, true);
4369                 skb = sch_handle_egress(skb, &rc, dev);
4370                 if (!skb)
4371                         goto out;
4372                 nf_skip_egress(skb, false);
4373 
4374                 if (netdev_xmit_txqueue_skipped())
4375                         txq = netdev_tx_queue_mapping(dev, skb);
4376         }
4377 #endif
4378         /* If device/qdisc don't need skb->dst, release it right now while
4379          * its hot in this cpu cache.
4380          */
4381         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4382                 skb_dst_drop(skb);
4383         else
4384                 skb_dst_force(skb);
4385 
4386         if (!txq)
4387                 txq = netdev_core_pick_tx(dev, skb, sb_dev);
4388 
4389         q = rcu_dereference_bh(txq->qdisc);
4390 
4391         trace_net_dev_queue(skb);
4392         if (q->enqueue) {
4393                 rc = __dev_xmit_skb(skb, q, dev, txq);
4394                 goto out;
4395         }
4396 
4397         /* The device has no queue. Common case for software devices:
4398          * loopback, all the sorts of tunnels...
4399 
4400          * Really, it is unlikely that netif_tx_lock protection is necessary
4401          * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
4402          * counters.)
4403          * However, it is possible, that they rely on protection
4404          * made by us here.
4405 
4406          * Check this and shot the lock. It is not prone from deadlocks.
4407          *Either shot noqueue qdisc, it is even simpler 8)
4408          */
4409         if (dev->flags & IFF_UP) {
4410                 int cpu = smp_processor_id(); /* ok because BHs are off */
4411 
4412                 /* Other cpus might concurrently change txq->xmit_lock_owner
4413                  * to -1 or to their cpu id, but not to our id.
4414                  */
4415                 if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
4416                         if (dev_xmit_recursion())
4417                                 goto recursion_alert;
4418 
4419                         skb = validate_xmit_skb(skb, dev, &again);
4420                         if (!skb)
4421                                 goto out;
4422 
4423                         HARD_TX_LOCK(dev, txq, cpu);
4424 
4425                         if (!netif_xmit_stopped(txq)) {
4426                                 dev_xmit_recursion_inc();
4427                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4428                                 dev_xmit_recursion_dec();
4429                                 if (dev_xmit_complete(rc)) {
4430                                         HARD_TX_UNLOCK(dev, txq);
4431                                         goto out;
4432                                 }
4433                         }
4434                         HARD_TX_UNLOCK(dev, txq);
4435                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4436                                              dev->name);
4437                 } else {
4438                         /* Recursion is detected! It is possible,
4439                          * unfortunately
4440                          */
4441 recursion_alert:
4442                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4443                                              dev->name);
4444                 }
4445         }
4446 
4447         rc = -ENETDOWN;
4448         rcu_read_unlock_bh();
4449 
4450         dev_core_stats_tx_dropped_inc(dev);
4451         kfree_skb_list(skb);
4452         return rc;
4453 out:
4454         rcu_read_unlock_bh();
4455         return rc;
4456 }
4457 EXPORT_SYMBOL(__dev_queue_xmit);
4458 
4459 int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4460 {
4461         struct net_device *dev = skb->dev;
4462         struct sk_buff *orig_skb = skb;
4463         struct netdev_queue *txq;
4464         int ret = NETDEV_TX_BUSY;
4465         bool again = false;
4466 
4467         if (unlikely(!netif_running(dev) ||
4468                      !netif_carrier_ok(dev)))
4469                 goto drop;
4470 
4471         skb = validate_xmit_skb_list(skb, dev, &again);
4472         if (skb != orig_skb)
4473                 goto drop;
4474 
4475         skb_set_queue_mapping(skb, queue_id);
4476         txq = skb_get_tx_queue(dev, skb);
4477 
4478         local_bh_disable();
4479 
4480         dev_xmit_recursion_inc();
4481         HARD_TX_LOCK(dev, txq, smp_processor_id());
4482         if (!netif_xmit_frozen_or_drv_stopped(txq))
4483                 ret = netdev_start_xmit(skb, dev, txq, false);
4484         HARD_TX_UNLOCK(dev, txq);
4485         dev_xmit_recursion_dec();
4486 
4487         local_bh_enable();
4488         return ret;
4489 drop:
4490         dev_core_stats_tx_dropped_inc(dev);
4491         kfree_skb_list(skb);
4492         return NET_XMIT_DROP;
4493 }
4494 EXPORT_SYMBOL(__dev_direct_xmit);
4495 
4496 /*************************************************************************
4497  *                      Receiver routines
4498  *************************************************************************/
4499 static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
4500 
4501 int weight_p __read_mostly = 64;           /* old backlog weight */
4502 int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
4503 int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
4504 
4505 /* Called with irq disabled */
4506 static inline void ____napi_schedule(struct softnet_data *sd,
4507                                      struct napi_struct *napi)
4508 {
4509         struct task_struct *thread;
4510 
4511         lockdep_assert_irqs_disabled();
4512 
4513         if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
4514                 /* Paired with smp_mb__before_atomic() in
4515                  * napi_enable()/dev_set_threaded().
4516                  * Use READ_ONCE() to guarantee a complete
4517                  * read on napi->thread. Only call
4518                  * wake_up_process() when it's not NULL.
4519                  */
4520                 thread = READ_ONCE(napi->thread);
4521                 if (thread) {
4522                         if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
4523                                 goto use_local_napi;
4524 
4525                         set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
4526                         wake_up_process(thread);
4527                         return;
4528                 }
4529         }
4530 
4531 use_local_napi:
4532         list_add_tail(&napi->poll_list, &sd->poll_list);
4533         WRITE_ONCE(napi->list_owner, smp_processor_id());
4534         /* If not called from net_rx_action()
4535          * we have to raise NET_RX_SOFTIRQ.
4536          */
4537         if (!sd->in_net_rx_action)
4538                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4539 }
4540 
4541 #ifdef CONFIG_RPS
4542 
4543 struct static_key_false rps_needed __read_mostly;
4544 EXPORT_SYMBOL(rps_needed);
4545 struct static_key_false rfs_needed __read_mostly;
4546 EXPORT_SYMBOL(rfs_needed);
4547 
4548 static struct rps_dev_flow *
4549 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4550             struct rps_dev_flow *rflow, u16 next_cpu)
4551 {
4552         if (next_cpu < nr_cpu_ids) {
4553                 u32 head;
4554 #ifdef CONFIG_RFS_ACCEL
4555                 struct netdev_rx_queue *rxqueue;
4556                 struct rps_dev_flow_table *flow_table;
4557                 struct rps_dev_flow *old_rflow;
4558                 u16 rxq_index;
4559                 u32 flow_id;
4560                 int rc;
4561 
4562                 /* Should we steer this flow to a different hardware queue? */
4563                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4564                     !(dev->features & NETIF_F_NTUPLE))
4565                         goto out;
4566                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4567                 if (rxq_index == skb_get_rx_queue(skb))
4568                         goto out;
4569 
4570                 rxqueue = dev->_rx + rxq_index;
4571                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
4572                 if (!flow_table)
4573                         goto out;
4574                 flow_id = skb_get_hash(skb) & flow_table->mask;
4575                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4576                                                         rxq_index, flow_id);
4577                 if (rc < 0)
4578                         goto out;
4579                 old_rflow = rflow;
4580                 rflow = &flow_table->flows[flow_id];
4581                 WRITE_ONCE(rflow->filter, rc);
4582                 if (old_rflow->filter == rc)
4583                         WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
4584         out:
4585 #endif
4586                 head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
4587                 rps_input_queue_tail_save(&rflow->last_qtail, head);
4588         }
4589 
4590         WRITE_ONCE(rflow->cpu, next_cpu);
4591         return rflow;
4592 }
4593 
4594 /*
4595  * get_rps_cpu is called from netif_receive_skb and returns the target
4596  * CPU from the RPS map of the receiving queue for a given skb.
4597  * rcu_read_lock must be held on entry.
4598  */
4599 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4600                        struct rps_dev_flow **rflowp)
4601 {
4602         const struct rps_sock_flow_table *sock_flow_table;
4603         struct netdev_rx_queue *rxqueue = dev->_rx;
4604         struct rps_dev_flow_table *flow_table;
4605         struct rps_map *map;
4606         int cpu = -1;
4607         u32 tcpu;
4608         u32 hash;
4609 
4610         if (skb_rx_queue_recorded(skb)) {
4611                 u16 index = skb_get_rx_queue(skb);
4612 
4613                 if (unlikely(index >= dev->real_num_rx_queues)) {
4614                         WARN_ONCE(dev->real_num_rx_queues > 1,
4615                                   "%s received packet on queue %u, but number "
4616                                   "of RX queues is %u\n",
4617                                   dev->name, index, dev->real_num_rx_queues);
4618                         goto done;
4619                 }
4620                 rxqueue += index;
4621         }
4622 
4623         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4624 
4625         flow_table = rcu_dereference(rxqueue->rps_flow_table);
4626         map = rcu_dereference(rxqueue->rps_map);
4627         if (!flow_table && !map)
4628                 goto done;
4629 
4630         skb_reset_network_header(skb);
4631         hash = skb_get_hash(skb);
4632         if (!hash)
4633                 goto done;
4634 
4635         sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
4636         if (flow_table && sock_flow_table) {
4637                 struct rps_dev_flow *rflow;
4638                 u32 next_cpu;
4639                 u32 ident;
4640 
4641                 /* First check into global flow table if there is a match.
4642                  * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
4643                  */
4644                 ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
4645                 if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
4646                         goto try_rps;
4647 
4648                 next_cpu = ident & net_hotdata.rps_cpu_mask;
4649 
4650                 /* OK, now we know there is a match,
4651                  * we can look at the local (per receive queue) flow table
4652                  */
4653                 rflow = &flow_table->flows[hash & flow_table->mask];
4654                 tcpu = rflow->cpu;
4655 
4656                 /*
4657                  * If the desired CPU (where last recvmsg was done) is
4658                  * different from current CPU (one in the rx-queue flow
4659                  * table entry), switch if one of the following holds:
4660                  *   - Current CPU is unset (>= nr_cpu_ids).
4661                  *   - Current CPU is offline.
4662                  *   - The current CPU's queue tail has advanced beyond the
4663                  *     last packet that was enqueued using this table entry.
4664                  *     This guarantees that all previous packets for the flow
4665                  *     have been dequeued, thus preserving in order delivery.
4666                  */
4667                 if (unlikely(tcpu != next_cpu) &&
4668                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4669                      ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
4670                       rflow->last_qtail)) >= 0)) {
4671                         tcpu = next_cpu;
4672                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4673                 }
4674 
4675                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4676                         *rflowp = rflow;
4677                         cpu = tcpu;
4678                         goto done;
4679                 }
4680         }
4681 
4682 try_rps:
4683 
4684         if (map) {
4685                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4686                 if (cpu_online(tcpu)) {
4687                         cpu = tcpu;
4688                         goto done;
4689                 }
4690         }
4691 
4692 done:
4693         return cpu;
4694 }
4695 
4696 #ifdef CONFIG_RFS_ACCEL
4697 
4698 /**
4699  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4700  * @dev: Device on which the filter was set
4701  * @rxq_index: RX queue index
4702  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4703  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4704  *
4705  * Drivers that implement ndo_rx_flow_steer() should periodically call
4706  * this function for each installed filter and remove the filters for
4707  * which it returns %true.
4708  */
4709 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4710                          u32 flow_id, u16 filter_id)
4711 {
4712         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4713         struct rps_dev_flow_table *flow_table;
4714         struct rps_dev_flow *rflow;
4715         bool expire = true;
4716         unsigned int cpu;
4717 
4718         rcu_read_lock();
4719         flow_table = rcu_dereference(rxqueue->rps_flow_table);
4720         if (flow_table && flow_id <= flow_table->mask) {
4721                 rflow = &flow_table->flows[flow_id];
4722                 cpu = READ_ONCE(rflow->cpu);
4723                 if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
4724                     ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
4725                            READ_ONCE(rflow->last_qtail)) <
4726                      (int)(10 * flow_table->mask)))
4727                         expire = false;
4728         }
4729         rcu_read_unlock();
4730         return expire;
4731 }
4732 EXPORT_SYMBOL(rps_may_expire_flow);
4733 
4734 #endif /* CONFIG_RFS_ACCEL */
4735 
4736 /* Called from hardirq (IPI) context */
4737 static void rps_trigger_softirq(void *data)
4738 {
4739         struct softnet_data *sd = data;
4740 
4741         ____napi_schedule(sd, &sd->backlog);
4742         sd->received_rps++;
4743 }
4744 
4745 #endif /* CONFIG_RPS */
4746 
4747 /* Called from hardirq (IPI) context */
4748 static void trigger_rx_softirq(void *data)
4749 {
4750         struct softnet_data *sd = data;
4751 
4752         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4753         smp_store_release(&sd->defer_ipi_scheduled, 0);
4754 }
4755 
4756 /*
4757  * After we queued a packet into sd->input_pkt_queue,
4758  * we need to make sure this queue is serviced soon.
4759  *
4760  * - If this is another cpu queue, link it to our rps_ipi_list,
4761  *   and make sure we will process rps_ipi_list from net_rx_action().
4762  *
4763  * - If this is our own queue, NAPI schedule our backlog.
4764  *   Note that this also raises NET_RX_SOFTIRQ.
4765  */
4766 static void napi_schedule_rps(struct softnet_data *sd)
4767 {
4768         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4769 
4770 #ifdef CONFIG_RPS
4771         if (sd != mysd) {
4772                 if (use_backlog_threads()) {
4773                         __napi_schedule_irqoff(&sd->backlog);
4774                         return;
4775                 }
4776 
4777                 sd->rps_ipi_next = mysd->rps_ipi_list;
4778                 mysd->rps_ipi_list = sd;
4779 
4780                 /* If not called from net_rx_action() or napi_threaded_poll()
4781                  * we have to raise NET_RX_SOFTIRQ.
4782                  */
4783                 if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
4784                         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4785                 return;
4786         }
4787 #endif /* CONFIG_RPS */
4788         __napi_schedule_irqoff(&mysd->backlog);
4789 }
4790 
4791 void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
4792 {
4793         unsigned long flags;
4794 
4795         if (use_backlog_threads()) {
4796                 backlog_lock_irq_save(sd, &flags);
4797 
4798                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
4799                         __napi_schedule_irqoff(&sd->backlog);
4800 
4801                 backlog_unlock_irq_restore(sd, &flags);
4802 
4803         } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
4804                 smp_call_function_single_async(cpu, &sd->defer_csd);
4805         }
4806 }
4807 
4808 #ifdef CONFIG_NET_FLOW_LIMIT
4809 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4810 #endif
4811 
4812 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4813 {
4814 #ifdef CONFIG_NET_FLOW_LIMIT
4815         struct sd_flow_limit *fl;
4816         struct softnet_data *sd;
4817         unsigned int old_flow, new_flow;
4818 
4819         if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
4820                 return false;
4821 
4822         sd = this_cpu_ptr(&softnet_data);
4823 
4824         rcu_read_lock();
4825         fl = rcu_dereference(sd->flow_limit);
4826         if (fl) {
4827                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4828                 old_flow = fl->history[fl->history_head];
4829                 fl->history[fl->history_head] = new_flow;
4830 
4831                 fl->history_head++;
4832                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4833 
4834                 if (likely(fl->buckets[old_flow]))
4835                         fl->buckets[old_flow]--;
4836 
4837                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4838                         fl->count++;
4839                         rcu_read_unlock();
4840                         return true;
4841                 }
4842         }
4843         rcu_read_unlock();
4844 #endif
4845         return false;
4846 }
4847 
4848 /*
4849  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4850  * queue (may be a remote CPU queue).
4851  */
4852 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4853                               unsigned int *qtail)
4854 {
4855         enum skb_drop_reason reason;
4856         struct softnet_data *sd;
4857         unsigned long flags;
4858         unsigned int qlen;
4859         int max_backlog;
4860         u32 tail;
4861 
4862         reason = SKB_DROP_REASON_DEV_READY;
4863         if (!netif_running(skb->dev))
4864                 goto bad_dev;
4865 
4866         reason = SKB_DROP_REASON_CPU_BACKLOG;
4867         sd = &per_cpu(softnet_data, cpu);
4868 
4869         qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
4870         max_backlog = READ_ONCE(net_hotdata.max_backlog);
4871         if (unlikely(qlen > max_backlog))
4872                 goto cpu_backlog_drop;
4873         backlog_lock_irq_save(sd, &flags);
4874         qlen = skb_queue_len(&sd->input_pkt_queue);
4875         if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
4876                 if (!qlen) {
4877                         /* Schedule NAPI for backlog device. We can use
4878                          * non atomic operation as we own the queue lock.
4879                          */
4880                         if (!__test_and_set_bit(NAPI_STATE_SCHED,
4881                                                 &sd->backlog.state))
4882                                 napi_schedule_rps(sd);
4883                 }
4884                 __skb_queue_tail(&sd->input_pkt_queue, skb);
4885                 tail = rps_input_queue_tail_incr(sd);
4886                 backlog_unlock_irq_restore(sd, &flags);
4887 
4888                 /* save the tail outside of the critical section */
4889                 rps_input_queue_tail_save(qtail, tail);
4890                 return NET_RX_SUCCESS;
4891         }
4892 
4893         backlog_unlock_irq_restore(sd, &flags);
4894 
4895 cpu_backlog_drop:
4896         atomic_inc(&sd->dropped);
4897 bad_dev:
4898         dev_core_stats_rx_dropped_inc(skb->dev);
4899         kfree_skb_reason(skb, reason);
4900         return NET_RX_DROP;
4901 }
4902 
4903 static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4904 {
4905         struct net_device *dev = skb->dev;
4906         struct netdev_rx_queue *rxqueue;
4907 
4908         rxqueue = dev->_rx;
4909 
4910         if (skb_rx_queue_recorded(skb)) {
4911                 u16 index = skb_get_rx_queue(skb);
4912 
4913                 if (unlikely(index >= dev->real_num_rx_queues)) {
4914                         WARN_ONCE(dev->real_num_rx_queues > 1,
4915                                   "%s received packet on queue %u, but number "
4916                                   "of RX queues is %u\n",
4917                                   dev->name, index, dev->real_num_rx_queues);
4918 
4919                         return rxqueue; /* Return first rxqueue */
4920                 }
4921                 rxqueue += index;
4922         }
4923         return rxqueue;
4924 }
4925 
4926 u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
4927                              struct bpf_prog *xdp_prog)
4928 {
4929         void *orig_data, *orig_data_end, *hard_start;
4930         struct netdev_rx_queue *rxqueue;
4931         bool orig_bcast, orig_host;
4932         u32 mac_len, frame_sz;
4933         __be16 orig_eth_type;
4934         struct ethhdr *eth;
4935         u32 metalen, act;
4936         int off;
4937 
4938         /* The XDP program wants to see the packet starting at the MAC
4939          * header.
4940          */
4941         mac_len = skb->data - skb_mac_header(skb);
4942         hard_start = skb->data - skb_headroom(skb);
4943 
4944         /* SKB "head" area always have tailroom for skb_shared_info */
4945         frame_sz = (void *)skb_end_pointer(skb) - hard_start;
4946         frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4947 
4948         rxqueue = netif_get_rxqueue(skb);
4949         xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
4950         xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
4951                          skb_headlen(skb) + mac_len, true);
4952         if (skb_is_nonlinear(skb)) {
4953                 skb_shinfo(skb)->xdp_frags_size = skb->data_len;
4954                 xdp_buff_set_frags_flag(xdp);
4955         } else {
4956                 xdp_buff_clear_frags_flag(xdp);
4957         }
4958 
4959         orig_data_end = xdp->data_end;
4960         orig_data = xdp->data;
4961         eth = (struct ethhdr *)xdp->data;
4962         orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
4963         orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4964         orig_eth_type = eth->h_proto;
4965 
4966         act = bpf_prog_run_xdp(xdp_prog, xdp);
4967 
4968         /* check if bpf_xdp_adjust_head was used */
4969         off = xdp->data - orig_data;
4970         if (off) {
4971                 if (off > 0)
4972                         __skb_pull(skb, off);
4973                 else if (off < 0)
4974                         __skb_push(skb, -off);
4975 
4976                 skb->mac_header += off;
4977                 skb_reset_network_header(skb);
4978         }
4979 
4980         /* check if bpf_xdp_adjust_tail was used */
4981         off = xdp->data_end - orig_data_end;
4982         if (off != 0) {
4983                 skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4984                 skb->len += off; /* positive on grow, negative on shrink */
4985         }
4986 
4987         /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
4988          * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
4989          */
4990         if (xdp_buff_has_frags(xdp))
4991                 skb->data_len = skb_shinfo(skb)->xdp_frags_size;
4992         else
4993                 skb->data_len = 0;
4994 
4995         /* check if XDP changed eth hdr such SKB needs update */
4996         eth = (struct ethhdr *)xdp->data;
4997         if ((orig_eth_type != eth->h_proto) ||
4998             (orig_host != ether_addr_equal_64bits(eth->h_dest,
4999                                                   skb->dev->dev_addr)) ||
5000             (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
5001                 __skb_push(skb, ETH_HLEN);
5002                 skb->pkt_type = PACKET_HOST;
5003                 skb->protocol = eth_type_trans(skb, skb->dev);
5004         }
5005 
5006         /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
5007          * before calling us again on redirect path. We do not call do_redirect
5008          * as we leave that up to the caller.
5009          *
5010          * Caller is responsible for managing lifetime of skb (i.e. calling
5011          * kfree_skb in response to actions it cannot handle/XDP_DROP).
5012          */
5013         switch (act) {
5014         case XDP_REDIRECT:
5015         case XDP_TX:
5016                 __skb_push(skb, mac_len);
5017                 break;
5018         case XDP_PASS:
5019                 metalen = xdp->data - xdp->data_meta;
5020                 if (metalen)
5021                         skb_metadata_set(skb, metalen);
5022                 break;
5023         }
5024 
5025         return act;
5026 }
5027 
5028 static int
5029 netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
5030 {
5031         struct sk_buff *skb = *pskb;
5032         int err, hroom, troom;
5033 
5034         if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
5035                 return 0;
5036 
5037         /* In case we have to go down the path and also linearize,
5038          * then lets do the pskb_expand_head() work just once here.
5039          */
5040         hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
5041         troom = skb->tail + skb->data_len - skb->end;
5042         err = pskb_expand_head(skb,
5043                                hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
5044                                troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
5045         if (err)
5046                 return err;
5047 
5048         return skb_linearize(skb);
5049 }
5050 
5051 static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
5052                                      struct xdp_buff *xdp,
5053                                      struct bpf_prog *xdp_prog)
5054 {
5055         struct sk_buff *skb = *pskb;
5056         u32 mac_len, act = XDP_DROP;
5057 
5058         /* Reinjected packets coming from act_mirred or similar should
5059          * not get XDP generic processing.
5060          */
5061         if (skb_is_redirected(skb))
5062                 return XDP_PASS;
5063 
5064         /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
5065          * bytes. This is the guarantee that also native XDP provides,
5066          * thus we need to do it here as well.
5067          */
5068         mac_len = skb->data - skb_mac_header(skb);
5069         __skb_push(skb, mac_len);
5070 
5071         if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
5072             skb_headroom(skb) < XDP_PACKET_HEADROOM) {
5073                 if (netif_skb_check_for_xdp(pskb, xdp_prog))
5074                         goto do_drop;
5075         }
5076 
5077         __skb_pull(*pskb, mac_len);
5078 
5079         act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
5080         switch (act) {
5081         case XDP_REDIRECT:
5082         case XDP_TX:
5083         case XDP_PASS:
5084                 break;
5085         default:
5086                 bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
5087                 fallthrough;
5088         case XDP_ABORTED:
5089                 trace_xdp_exception((*pskb)->dev, xdp_prog, act);
5090                 fallthrough;
5091         case XDP_DROP:
5092         do_drop:
5093                 kfree_skb(*pskb);
5094                 break;
5095         }
5096 
5097         return act;
5098 }
5099 
5100 /* When doing generic XDP we have to bypass the qdisc layer and the
5101  * network taps in order to match in-driver-XDP behavior. This also means
5102  * that XDP packets are able to starve other packets going through a qdisc,
5103  * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
5104  * queues, so they do not have this starvation issue.
5105  */
5106 void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
5107 {
5108         struct net_device *dev = skb->dev;
5109         struct netdev_queue *txq;
5110         bool free_skb = true;
5111         int cpu, rc;
5112 
5113         txq = netdev_core_pick_tx(dev, skb, NULL);
5114         cpu = smp_processor_id();
5115         HARD_TX_LOCK(dev, txq, cpu);
5116         if (!netif_xmit_frozen_or_drv_stopped(txq)) {
5117                 rc = netdev_start_xmit(skb, dev, txq, 0);
5118                 if (dev_xmit_complete(rc))
5119                         free_skb = false;
5120         }
5121         HARD_TX_UNLOCK(dev, txq);
5122         if (free_skb) {
5123                 trace_xdp_exception(dev, xdp_prog, XDP_TX);
5124                 dev_core_stats_tx_dropped_inc(dev);
5125                 kfree_skb(skb);
5126         }
5127 }
5128 
5129 static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
5130 
5131 int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
5132 {
5133         struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
5134 
5135         if (xdp_prog) {
5136                 struct xdp_buff xdp;
5137                 u32 act;
5138                 int err;
5139 
5140                 bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
5141                 act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
5142                 if (act != XDP_PASS) {
5143                         switch (act) {
5144                         case XDP_REDIRECT:
5145                                 err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
5146                                                               &xdp, xdp_prog);
5147                                 if (err)
5148                                         goto out_redir;
5149                                 break;
5150                         case XDP_TX:
5151                                 generic_xdp_tx(*pskb, xdp_prog);
5152                                 break;
5153                         }
5154                         bpf_net_ctx_clear(bpf_net_ctx);
5155                         return XDP_DROP;
5156                 }
5157                 bpf_net_ctx_clear(bpf_net_ctx);
5158         }
5159         return XDP_PASS;
5160 out_redir:
5161         bpf_net_ctx_clear(bpf_net_ctx);
5162         kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
5163         return XDP_DROP;
5164 }
5165 EXPORT_SYMBOL_GPL(do_xdp_generic);
5166 
5167 static int netif_rx_internal(struct sk_buff *skb)
5168 {
5169         int ret;
5170 
5171         net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5172 
5173         trace_netif_rx(skb);
5174 
5175 #ifdef CONFIG_RPS
5176         if (static_branch_unlikely(&rps_needed)) {
5177                 struct rps_dev_flow voidflow, *rflow = &voidflow;
5178                 int cpu;
5179 
5180                 rcu_read_lock();
5181 
5182                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
5183                 if (cpu < 0)
5184                         cpu = smp_processor_id();
5185 
5186                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5187 
5188                 rcu_read_unlock();
5189         } else
5190 #endif
5191         {
5192                 unsigned int qtail;
5193 
5194                 ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
5195         }
5196         return ret;
5197 }
5198 
5199 /**
5200  *      __netif_rx      -       Slightly optimized version of netif_rx
5201  *      @skb: buffer to post
5202  *
5203  *      This behaves as netif_rx except that it does not disable bottom halves.
5204  *      As a result this function may only be invoked from the interrupt context
5205  *      (either hard or soft interrupt).
5206  */
5207 int __netif_rx(struct sk_buff *skb)
5208 {
5209         int ret;
5210 
5211         lockdep_assert_once(hardirq_count() | softirq_count());
5212 
5213         trace_netif_rx_entry(skb);
5214         ret = netif_rx_internal(skb);
5215         trace_netif_rx_exit(ret);
5216         return ret;
5217 }
5218 EXPORT_SYMBOL(__netif_rx);
5219 
5220 /**
5221  *      netif_rx        -       post buffer to the network code
5222  *      @skb: buffer to post
5223  *
5224  *      This function receives a packet from a device driver and queues it for
5225  *      the upper (protocol) levels to process via the backlog NAPI device. It
5226  *      always succeeds. The buffer may be dropped during processing for
5227  *      congestion control or by the protocol layers.
5228  *      The network buffer is passed via the backlog NAPI device. Modern NIC
5229  *      driver should use NAPI and GRO.
5230  *      This function can used from interrupt and from process context. The
5231  *      caller from process context must not disable interrupts before invoking
5232  *      this function.
5233  *
5234  *      return values:
5235  *      NET_RX_SUCCESS  (no congestion)
5236  *      NET_RX_DROP     (packet was dropped)
5237  *
5238  */
5239 int netif_rx(struct sk_buff *skb)
5240 {
5241         bool need_bh_off = !(hardirq_count() | softirq_count());
5242         int ret;
5243 
5244         if (need_bh_off)
5245                 local_bh_disable();
5246         trace_netif_rx_entry(skb);
5247         ret = netif_rx_internal(skb);
5248         trace_netif_rx_exit(ret);
5249         if (need_bh_off)
5250                 local_bh_enable();
5251         return ret;
5252 }
5253 EXPORT_SYMBOL(netif_rx);
5254 
5255 static __latent_entropy void net_tx_action(struct softirq_action *h)
5256 {
5257         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5258 
5259         if (sd->completion_queue) {
5260                 struct sk_buff *clist;
5261 
5262                 local_irq_disable();
5263                 clist = sd->completion_queue;
5264                 sd->completion_queue = NULL;
5265                 local_irq_enable();
5266 
5267                 while (clist) {
5268                         struct sk_buff *skb = clist;
5269 
5270                         clist = clist->next;
5271 
5272                         WARN_ON(refcount_read(&skb->users));
5273                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
5274                                 trace_consume_skb(skb, net_tx_action);
5275                         else
5276                                 trace_kfree_skb(skb, net_tx_action,
5277                                                 get_kfree_skb_cb(skb)->reason, NULL);
5278 
5279                         if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
5280                                 __kfree_skb(skb);
5281                         else
5282                                 __napi_kfree_skb(skb,
5283                                                  get_kfree_skb_cb(skb)->reason);
5284                 }
5285         }
5286 
5287         if (sd->output_queue) {
5288                 struct Qdisc *head;
5289 
5290                 local_irq_disable();
5291                 head = sd->output_queue;
5292                 sd->output_queue = NULL;
5293                 sd->output_queue_tailp = &sd->output_queue;
5294                 local_irq_enable();
5295 
5296                 rcu_read_lock();
5297 
5298                 while (head) {
5299                         struct Qdisc *q = head;
5300                         spinlock_t *root_lock = NULL;
5301 
5302                         head = head->next_sched;
5303 
5304                         /* We need to make sure head->next_sched is read
5305                          * before clearing __QDISC_STATE_SCHED
5306                          */
5307                         smp_mb__before_atomic();
5308 
5309                         if (!(q->flags & TCQ_F_NOLOCK)) {
5310                                 root_lock = qdisc_lock(q);
5311                                 spin_lock(root_lock);
5312                         } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
5313                                                      &q->state))) {
5314                                 /* There is a synchronize_net() between
5315                                  * STATE_DEACTIVATED flag being set and
5316                                  * qdisc_reset()/some_qdisc_is_busy() in
5317                                  * dev_deactivate(), so we can safely bail out
5318                                  * early here to avoid data race between
5319                                  * qdisc_deactivate() and some_qdisc_is_busy()
5320                                  * for lockless qdisc.
5321                                  */
5322                                 clear_bit(__QDISC_STATE_SCHED, &q->state);
5323                                 continue;
5324                         }
5325 
5326                         clear_bit(__QDISC_STATE_SCHED, &q->state);
5327                         qdisc_run(q);
5328                         if (root_lock)
5329                                 spin_unlock(root_lock);
5330                 }
5331 
5332                 rcu_read_unlock();
5333         }
5334 
5335         xfrm_dev_backlog(sd);
5336 }
5337 
5338 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
5339 /* This hook is defined here for ATM LANE */
5340 int (*br_fdb_test_addr_hook)(struct net_device *dev,
5341                              unsigned char *addr) __read_mostly;
5342 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
5343 #endif
5344 
5345 /**
5346  *      netdev_is_rx_handler_busy - check if receive handler is registered
5347  *      @dev: device to check
5348  *
5349  *      Check if a receive handler is already registered for a given device.
5350  *      Return true if there one.
5351  *
5352  *      The caller must hold the rtnl_mutex.
5353  */
5354 bool netdev_is_rx_handler_busy(struct net_device *dev)
5355 {
5356         ASSERT_RTNL();
5357         return dev && rtnl_dereference(dev->rx_handler);
5358 }
5359 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
5360 
5361 /**
5362  *      netdev_rx_handler_register - register receive handler
5363  *      @dev: device to register a handler for
5364  *      @rx_handler: receive handler to register
5365  *      @rx_handler_data: data pointer that is used by rx handler
5366  *
5367  *      Register a receive handler for a device. This handler will then be
5368  *      called from __netif_receive_skb. A negative errno code is returned
5369  *      on a failure.
5370  *
5371  *      The caller must hold the rtnl_mutex.
5372  *
5373  *      For a general description of rx_handler, see enum rx_handler_result.
5374  */
5375 int netdev_rx_handler_register(struct net_device *dev,
5376                                rx_handler_func_t *rx_handler,
5377                                void *rx_handler_data)
5378 {
5379         if (netdev_is_rx_handler_busy(dev))
5380                 return -EBUSY;
5381 
5382         if (dev->priv_flags & IFF_NO_RX_HANDLER)
5383                 return -EINVAL;
5384 
5385         /* Note: rx_handler_data must be set before rx_handler */
5386         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5387         rcu_assign_pointer(dev->rx_handler, rx_handler);
5388 
5389         return 0;
5390 }
5391 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
5392 
5393 /**
5394  *      netdev_rx_handler_unregister - unregister receive handler
5395  *      @dev: device to unregister a handler from
5396  *
5397  *      Unregister a receive handler from a device.
5398  *
5399  *      The caller must hold the rtnl_mutex.
5400  */
5401 void netdev_rx_handler_unregister(struct net_device *dev)
5402 {
5403 
5404         ASSERT_RTNL();
5405         RCU_INIT_POINTER(dev->rx_handler, NULL);
5406         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
5407          * section has a guarantee to see a non NULL rx_handler_data
5408          * as well.
5409          */
5410         synchronize_net();
5411         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5412 }
5413 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5414 
5415 /*
5416  * Limit the use of PFMEMALLOC reserves to those protocols that implement
5417  * the special handling of PFMEMALLOC skbs.
5418  */
5419 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5420 {
5421         switch (skb->protocol) {
5422         case htons(ETH_P_ARP):
5423         case htons(ETH_P_IP):
5424         case htons(ETH_P_IPV6):
5425         case htons(ETH_P_8021Q):
5426         case htons(ETH_P_8021AD):
5427                 return true;
5428         default:
5429                 return false;
5430         }
5431 }
5432 
5433 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5434                              int *ret, struct net_device *orig_dev)
5435 {
5436         if (nf_hook_ingress_active(skb)) {
5437                 int ingress_retval;
5438 
5439                 if (*pt_prev) {
5440                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
5441                         *pt_prev = NULL;
5442                 }
5443 
5444                 rcu_read_lock();
5445                 ingress_retval = nf_hook_ingress(skb);
5446                 rcu_read_unlock();
5447                 return ingress_retval;
5448         }
5449         return 0;
5450 }
5451 
5452 static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5453                                     struct packet_type **ppt_prev)
5454 {
5455         struct packet_type *ptype, *pt_prev;
5456         rx_handler_func_t *rx_handler;
5457         struct sk_buff *skb = *pskb;
5458         struct net_device *orig_dev;
5459         bool deliver_exact = false;
5460         int ret = NET_RX_DROP;
5461         __be16 type;
5462 
5463         net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5464 
5465         trace_netif_receive_skb(skb);
5466 
5467         orig_dev = skb->dev;
5468 
5469         skb_reset_network_header(skb);
5470         if (!skb_transport_header_was_set(skb))
5471                 skb_reset_transport_header(skb);
5472         skb_reset_mac_len(skb);
5473 
5474         pt_prev = NULL;
5475 
5476 another_round:
5477         skb->skb_iif = skb->dev->ifindex;
5478 
5479         __this_cpu_inc(softnet_data.processed);
5480 
5481         if (static_branch_unlikely(&generic_xdp_needed_key)) {
5482                 int ret2;
5483 
5484                 migrate_disable();
5485                 ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
5486                                       &skb);
5487                 migrate_enable();
5488 
5489                 if (ret2 != XDP_PASS) {
5490                         ret = NET_RX_DROP;
5491                         goto out;
5492                 }
5493         }
5494 
5495         if (eth_type_vlan(skb->protocol)) {
5496                 skb = skb_vlan_untag(skb);
5497                 if (unlikely(!skb))
5498                         goto out;
5499         }
5500 
5501         if (skb_skip_tc_classify(skb))
5502                 goto skip_classify;
5503 
5504         if (pfmemalloc)
5505                 goto skip_taps;
5506 
5507         list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
5508                 if (pt_prev)
5509                         ret = deliver_skb(skb, pt_prev, orig_dev);
5510                 pt_prev = ptype;
5511         }
5512 
5513         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5514                 if (pt_prev)
5515                         ret = deliver_skb(skb, pt_prev, orig_dev);
5516                 pt_prev = ptype;
5517         }
5518 
5519 skip_taps:
5520 #ifdef CONFIG_NET_INGRESS
5521         if (static_branch_unlikely(&ingress_needed_key)) {
5522                 bool another = false;
5523 
5524                 nf_skip_egress(skb, true);
5525                 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5526                                          &another);
5527                 if (another)
5528                         goto another_round;
5529                 if (!skb)
5530                         goto out;
5531 
5532                 nf_skip_egress(skb, false);
5533                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5534                         goto out;
5535         }
5536 #endif
5537         skb_reset_redirect(skb);
5538 skip_classify:
5539         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5540                 goto drop;
5541 
5542         if (skb_vlan_tag_present(skb)) {
5543                 if (pt_prev) {
5544                         ret = deliver_skb(skb, pt_prev, orig_dev);
5545                         pt_prev = NULL;
5546                 }
5547                 if (vlan_do_receive(&skb))
5548                         goto another_round;
5549                 else if (unlikely(!skb))
5550                         goto out;
5551         }
5552 
5553         rx_handler = rcu_dereference(skb->dev->rx_handler);
5554         if (rx_handler) {
5555                 if (pt_prev) {
5556                         ret = deliver_skb(skb, pt_prev, orig_dev);
5557                         pt_prev = NULL;
5558                 }
5559                 switch (rx_handler(&skb)) {
5560                 case RX_HANDLER_CONSUMED:
5561                         ret = NET_RX_SUCCESS;
5562                         goto out;
5563                 case RX_HANDLER_ANOTHER:
5564                         goto another_round;
5565                 case RX_HANDLER_EXACT:
5566                         deliver_exact = true;
5567                         break;
5568                 case RX_HANDLER_PASS:
5569                         break;
5570                 default:
5571                         BUG();
5572                 }
5573         }
5574 
5575         if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
5576 check_vlan_id:
5577                 if (skb_vlan_tag_get_id(skb)) {
5578                         /* Vlan id is non 0 and vlan_do_receive() above couldn't
5579                          * find vlan device.
5580                          */
5581                         skb->pkt_type = PACKET_OTHERHOST;
5582                 } else if (eth_type_vlan(skb->protocol)) {
5583                         /* Outer header is 802.1P with vlan 0, inner header is
5584                          * 802.1Q or 802.1AD and vlan_do_receive() above could
5585                          * not find vlan dev for vlan id 0.
5586                          */
5587                         __vlan_hwaccel_clear_tag(skb);
5588                         skb = skb_vlan_untag(skb);
5589                         if (unlikely(!skb))
5590                                 goto out;
5591                         if (vlan_do_receive(&skb))
5592                                 /* After stripping off 802.1P header with vlan 0
5593                                  * vlan dev is found for inner header.
5594                                  */
5595                                 goto another_round;
5596                         else if (unlikely(!skb))
5597                                 goto out;
5598                         else
5599                                 /* We have stripped outer 802.1P vlan 0 header.
5600                                  * But could not find vlan dev.
5601                                  * check again for vlan id to set OTHERHOST.
5602                                  */
5603                                 goto check_vlan_id;
5604                 }
5605                 /* Note: we might in the future use prio bits
5606                  * and set skb->priority like in vlan_do_receive()
5607                  * For the time being, just ignore Priority Code Point
5608                  */
5609                 __vlan_hwaccel_clear_tag(skb);
5610         }
5611 
5612         type = skb->protocol;
5613 
5614         /* deliver only exact match when indicated */
5615         if (likely(!deliver_exact)) {
5616                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5617                                        &ptype_base[ntohs(type) &
5618                                                    PTYPE_HASH_MASK]);
5619         }
5620 
5621         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5622                                &orig_dev->ptype_specific);
5623 
5624         if (unlikely(skb->dev != orig_dev)) {
5625                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5626                                        &skb->dev->ptype_specific);
5627         }
5628 
5629         if (pt_prev) {
5630                 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5631                         goto drop;
5632                 *ppt_prev = pt_prev;
5633         } else {
5634 drop:
5635                 if (!deliver_exact)
5636                         dev_core_stats_rx_dropped_inc(skb->dev);
5637                 else
5638                         dev_core_stats_rx_nohandler_inc(skb->dev);
5639                 kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
5640                 /* Jamal, now you will not able to escape explaining
5641                  * me how you were going to use this. :-)
5642                  */
5643                 ret = NET_RX_DROP;
5644         }
5645 
5646 out:
5647         /* The invariant here is that if *ppt_prev is not NULL
5648          * then skb should also be non-NULL.
5649          *
5650          * Apparently *ppt_prev assignment above holds this invariant due to
5651          * skb dereferencing near it.
5652          */
5653         *pskb = skb;
5654         return ret;
5655 }
5656 
5657 static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5658 {
5659         struct net_device *orig_dev = skb->dev;
5660         struct packet_type *pt_prev = NULL;
5661         int ret;
5662 
5663         ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5664         if (pt_prev)
5665                 ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5666                                          skb->dev, pt_prev, orig_dev);
5667         return ret;
5668 }
5669 
5670 /**
5671  *      netif_receive_skb_core - special purpose version of netif_receive_skb
5672  *      @skb: buffer to process
5673  *
5674  *      More direct receive version of netif_receive_skb().  It should
5675  *      only be used by callers that have a need to skip RPS and Generic XDP.
5676  *      Caller must also take care of handling if ``(page_is_)pfmemalloc``.
5677  *
5678  *      This function may only be called from softirq context and interrupts
5679  *      should be enabled.
5680  *
5681  *      Return values (usually ignored):
5682  *      NET_RX_SUCCESS: no congestion
5683  *      NET_RX_DROP: packet was dropped
5684  */
5685 int netif_receive_skb_core(struct sk_buff *skb)
5686 {
5687         int ret;
5688 
5689         rcu_read_lock();
5690         ret = __netif_receive_skb_one_core(skb, false);
5691         rcu_read_unlock();
5692 
5693         return ret;
5694 }
5695 EXPORT_SYMBOL(netif_receive_skb_core);
5696 
5697 static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5698                                                   struct packet_type *pt_prev,
5699                                                   struct net_device *orig_dev)
5700 {
5701         struct sk_buff *skb, *next;
5702 
5703         if (!pt_prev)
5704                 return;
5705         if (list_empty(head))
5706                 return;
5707         if (pt_prev->list_func != NULL)
5708                 INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5709                                    ip_list_rcv, head, pt_prev, orig_dev);
5710         else
5711                 list_for_each_entry_safe(skb, next, head, list) {
5712                         skb_list_del_init(skb);
5713                         pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5714                 }
5715 }
5716 
5717 static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5718 {
5719         /* Fast-path assumptions:
5720          * - There is no RX handler.
5721          * - Only one packet_type matches.
5722          * If either of these fails, we will end up doing some per-packet
5723          * processing in-line, then handling the 'last ptype' for the whole
5724          * sublist.  This can't cause out-of-order delivery to any single ptype,
5725          * because the 'last ptype' must be constant across the sublist, and all
5726          * other ptypes are handled per-packet.
5727          */
5728         /* Current (common) ptype of sublist */
5729         struct packet_type *pt_curr = NULL;
5730         /* Current (common) orig_dev of sublist */
5731         struct net_device *od_curr = NULL;
5732         struct list_head sublist;
5733         struct sk_buff *skb, *next;
5734 
5735         INIT_LIST_HEAD(&sublist);
5736         list_for_each_entry_safe(skb, next, head, list) {
5737                 struct net_device *orig_dev = skb->dev;
5738                 struct packet_type *pt_prev = NULL;
5739 
5740                 skb_list_del_init(skb);
5741                 __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5742                 if (!pt_prev)
5743                         continue;
5744                 if (pt_curr != pt_prev || od_curr != orig_dev) {
5745                         /* dispatch old sublist */
5746                         __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5747                         /* start new sublist */
5748                         INIT_LIST_HEAD(&sublist);
5749                         pt_curr = pt_prev;
5750                         od_curr = orig_dev;
5751                 }
5752                 list_add_tail(&skb->list, &sublist);
5753         }
5754 
5755         /* dispatch final sublist */
5756         __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5757 }
5758 
5759 static int __netif_receive_skb(struct sk_buff *skb)
5760 {
5761         int ret;
5762 
5763         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5764                 unsigned int noreclaim_flag;
5765 
5766                 /*
5767                  * PFMEMALLOC skbs are special, they should
5768                  * - be delivered to SOCK_MEMALLOC sockets only
5769                  * - stay away from userspace
5770                  * - have bounded memory usage
5771                  *
5772                  * Use PF_MEMALLOC as this saves us from propagating the allocation
5773                  * context down to all allocation sites.
5774                  */
5775                 noreclaim_flag = memalloc_noreclaim_save();
5776                 ret = __netif_receive_skb_one_core(skb, true);
5777                 memalloc_noreclaim_restore(noreclaim_flag);
5778         } else
5779                 ret = __netif_receive_skb_one_core(skb, false);
5780 
5781         return ret;
5782 }
5783 
5784 static void __netif_receive_skb_list(struct list_head *head)
5785 {
5786         unsigned long noreclaim_flag = 0;
5787         struct sk_buff *skb, *next;
5788         bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5789 
5790         list_for_each_entry_safe(skb, next, head, list) {
5791                 if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5792                         struct list_head sublist;
5793 
5794                         /* Handle the previous sublist */
5795                         list_cut_before(&sublist, head, &skb->list);
5796                         if (!list_empty(&sublist))
5797                                 __netif_receive_skb_list_core(&sublist, pfmemalloc);
5798                         pfmemalloc = !pfmemalloc;
5799                         /* See comments in __netif_receive_skb */
5800                         if (pfmemalloc)
5801                                 noreclaim_flag = memalloc_noreclaim_save();
5802                         else
5803                                 memalloc_noreclaim_restore(noreclaim_flag);
5804                 }
5805         }
5806         /* Handle the remaining sublist */
5807         if (!list_empty(head))
5808                 __netif_receive_skb_list_core(head, pfmemalloc);
5809         /* Restore pflags */
5810         if (pfmemalloc)
5811                 memalloc_noreclaim_restore(noreclaim_flag);
5812 }
5813 
5814 static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5815 {
5816         struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5817         struct bpf_prog *new = xdp->prog;
5818         int ret = 0;
5819 
5820         switch (xdp->command) {
5821         case XDP_SETUP_PROG:
5822                 rcu_assign_pointer(dev->xdp_prog, new);
5823                 if (old)
5824                         bpf_prog_put(old);
5825 
5826                 if (old && !new) {
5827                         static_branch_dec(&generic_xdp_needed_key);
5828                 } else if (new && !old) {
5829                         static_branch_inc(&generic_xdp_needed_key);
5830                         dev_disable_lro(dev);
5831                         dev_disable_gro_hw(dev);
5832                 }
5833                 break;
5834 
5835         default:
5836                 ret = -EINVAL;
5837                 break;
5838         }
5839 
5840         return ret;
5841 }
5842 
5843 static int netif_receive_skb_internal(struct sk_buff *skb)
5844 {
5845         int ret;
5846 
5847         net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
5848 
5849         if (skb_defer_rx_timestamp(skb))
5850                 return NET_RX_SUCCESS;
5851 
5852         rcu_read_lock();
5853 #ifdef CONFIG_RPS
5854         if (static_branch_unlikely(&rps_needed)) {
5855                 struct rps_dev_flow voidflow, *rflow = &voidflow;
5856                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5857 
5858                 if (cpu >= 0) {
5859                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5860                         rcu_read_unlock();
5861                         return ret;
5862                 }
5863         }
5864 #endif
5865         ret = __netif_receive_skb(skb);
5866         rcu_read_unlock();
5867         return ret;
5868 }
5869 
5870 void netif_receive_skb_list_internal(struct list_head *head)
5871 {
5872         struct sk_buff *skb, *next;
5873         struct list_head sublist;
5874 
5875         INIT_LIST_HEAD(&sublist);
5876         list_for_each_entry_safe(skb, next, head, list) {
5877                 net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
5878                                     skb);
5879                 skb_list_del_init(skb);
5880                 if (!skb_defer_rx_timestamp(skb))
5881                         list_add_tail(&skb->list, &sublist);
5882         }
5883         list_splice_init(&sublist, head);
5884 
5885         rcu_read_lock();
5886 #ifdef CONFIG_RPS
5887         if (static_branch_unlikely(&rps_needed)) {
5888                 list_for_each_entry_safe(skb, next, head, list) {
5889                         struct rps_dev_flow voidflow, *rflow = &voidflow;
5890                         int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5891 
5892                         if (cpu >= 0) {
5893                                 /* Will be handled, remove from list */
5894                                 skb_list_del_init(skb);
5895                                 enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5896                         }
5897                 }
5898         }
5899 #endif
5900         __netif_receive_skb_list(head);
5901         rcu_read_unlock();
5902 }
5903 
5904 /**
5905  *      netif_receive_skb - process receive buffer from network
5906  *      @skb: buffer to process
5907  *
5908  *      netif_receive_skb() is the main receive data processing function.
5909  *      It always succeeds. The buffer may be dropped during processing
5910  *      for congestion control or by the protocol layers.
5911  *
5912  *      This function may only be called from softirq context and interrupts
5913  *      should be enabled.
5914  *
5915  *      Return values (usually ignored):
5916  *      NET_RX_SUCCESS: no congestion
5917  *      NET_RX_DROP: packet was dropped
5918  */
5919 int netif_receive_skb(struct sk_buff *skb)
5920 {
5921         int ret;
5922 
5923         trace_netif_receive_skb_entry(skb);
5924 
5925         ret = netif_receive_skb_internal(skb);
5926         trace_netif_receive_skb_exit(ret);
5927 
5928         return ret;
5929 }
5930 EXPORT_SYMBOL(netif_receive_skb);
5931 
5932 /**
5933  *      netif_receive_skb_list - process many receive buffers from network
5934  *      @head: list of skbs to process.
5935  *
5936  *      Since return value of netif_receive_skb() is normally ignored, and
5937  *      wouldn't be meaningful for a list, this function returns void.
5938  *
5939  *      This function may only be called from softirq context and interrupts
5940  *      should be enabled.
5941  */
5942 void netif_receive_skb_list(struct list_head *head)
5943 {
5944         struct sk_buff *skb;
5945 
5946         if (list_empty(head))
5947                 return;
5948         if (trace_netif_receive_skb_list_entry_enabled()) {
5949                 list_for_each_entry(skb, head, list)
5950                         trace_netif_receive_skb_list_entry(skb);
5951         }
5952         netif_receive_skb_list_internal(head);
5953         trace_netif_receive_skb_list_exit(0);
5954 }
5955 EXPORT_SYMBOL(netif_receive_skb_list);
5956 
5957 static DEFINE_PER_CPU(struct work_struct, flush_works);
5958 
5959 /* Network device is going away, flush any packets still pending */
5960 static void flush_backlog(struct work_struct *work)
5961 {
5962         struct sk_buff *skb, *tmp;
5963         struct softnet_data *sd;
5964 
5965         local_bh_disable();
5966         sd = this_cpu_ptr(&softnet_data);
5967 
5968         backlog_lock_irq_disable(sd);
5969         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5970                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5971                         __skb_unlink(skb, &sd->input_pkt_queue);
5972                         dev_kfree_skb_irq(skb);
5973                         rps_input_queue_head_incr(sd);
5974                 }
5975         }
5976         backlog_unlock_irq_enable(sd);
5977 
5978         local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
5979         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5980                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5981                         __skb_unlink(skb, &sd->process_queue);
5982                         kfree_skb(skb);
5983                         rps_input_queue_head_incr(sd);
5984                 }
5985         }
5986         local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
5987         local_bh_enable();
5988 }
5989 
5990 static bool flush_required(int cpu)
5991 {
5992 #if IS_ENABLED(CONFIG_RPS)
5993         struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5994         bool do_flush;
5995 
5996         backlog_lock_irq_disable(sd);
5997 
5998         /* as insertion into process_queue happens with the rps lock held,
5999          * process_queue access may race only with dequeue
6000          */
6001         do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
6002                    !skb_queue_empty_lockless(&sd->process_queue);
6003         backlog_unlock_irq_enable(sd);
6004 
6005         return do_flush;
6006 #endif
6007         /* without RPS we can't safely check input_pkt_queue: during a
6008          * concurrent remote skb_queue_splice() we can detect as empty both
6009          * input_pkt_queue and process_queue even if the latter could end-up
6010          * containing a lot of packets.
6011          */
6012         return true;
6013 }
6014 
6015 static void flush_all_backlogs(void)
6016 {
6017         static cpumask_t flush_cpus;
6018         unsigned int cpu;
6019 
6020         /* since we are under rtnl lock protection we can use static data
6021          * for the cpumask and avoid allocating on stack the possibly
6022          * large mask
6023          */
6024         ASSERT_RTNL();
6025 
6026         cpus_read_lock();
6027 
6028         cpumask_clear(&flush_cpus);
6029         for_each_online_cpu(cpu) {
6030                 if (flush_required(cpu)) {
6031                         queue_work_on(cpu, system_highpri_wq,
6032                                       per_cpu_ptr(&flush_works, cpu));
6033                         cpumask_set_cpu(cpu, &flush_cpus);
6034                 }
6035         }
6036 
6037         /* we can have in flight packet[s] on the cpus we are not flushing,
6038          * synchronize_net() in unregister_netdevice_many() will take care of
6039          * them
6040          */
6041         for_each_cpu(cpu, &flush_cpus)
6042                 flush_work(per_cpu_ptr(&flush_works, cpu));
6043 
6044         cpus_read_unlock();
6045 }
6046 
6047 static void net_rps_send_ipi(struct softnet_data *remsd)
6048 {
6049 #ifdef CONFIG_RPS
6050         while (remsd) {
6051                 struct softnet_data *next = remsd->rps_ipi_next;
6052 
6053                 if (cpu_online(remsd->cpu))
6054                         smp_call_function_single_async(remsd->cpu, &remsd->csd);
6055                 remsd = next;
6056         }
6057 #endif
6058 }
6059 
6060 /*
6061  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
6062  * Note: called with local irq disabled, but exits with local irq enabled.
6063  */
6064 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
6065 {
6066 #ifdef CONFIG_RPS
6067         struct softnet_data *remsd = sd->rps_ipi_list;
6068 
6069         if (!use_backlog_threads() && remsd) {
6070                 sd->rps_ipi_list = NULL;
6071 
6072                 local_irq_enable();
6073 
6074                 /* Send pending IPI's to kick RPS processing on remote cpus. */
6075                 net_rps_send_ipi(remsd);
6076         } else
6077 #endif
6078                 local_irq_enable();
6079 }
6080 
6081 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
6082 {
6083 #ifdef CONFIG_RPS
6084         return !use_backlog_threads() && sd->rps_ipi_list;
6085 #else
6086         return false;
6087 #endif
6088 }
6089 
6090 static int process_backlog(struct napi_struct *napi, int quota)
6091 {
6092         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
6093         bool again = true;
6094         int work = 0;
6095 
6096         /* Check if we have pending ipi, its better to send them now,
6097          * not waiting net_rx_action() end.
6098          */
6099         if (sd_has_rps_ipi_waiting(sd)) {
6100                 local_irq_disable();
6101                 net_rps_action_and_irq_enable(sd);
6102         }
6103 
6104         napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
6105         while (again) {
6106                 struct sk_buff *skb;
6107 
6108                 local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
6109                 while ((skb = __skb_dequeue(&sd->process_queue))) {
6110                         local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
6111                         rcu_read_lock();
6112                         __netif_receive_skb(skb);
6113                         rcu_read_unlock();
6114                         if (++work >= quota) {
6115                                 rps_input_queue_head_add(sd, work);
6116                                 return work;
6117                         }
6118 
6119                         local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
6120                 }
6121                 local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
6122 
6123                 backlog_lock_irq_disable(sd);
6124                 if (skb_queue_empty(&sd->input_pkt_queue)) {
6125                         /*
6126                          * Inline a custom version of __napi_complete().
6127                          * only current cpu owns and manipulates this napi,
6128                          * and NAPI_STATE_SCHED is the only possible flag set
6129                          * on backlog.
6130                          * We can use a plain write instead of clear_bit(),
6131                          * and we dont need an smp_mb() memory barrier.
6132                          */
6133                         napi->state &= NAPIF_STATE_THREADED;
6134                         again = false;
6135                 } else {
6136                         local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
6137                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
6138                                                    &sd->process_queue);
6139                         local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
6140                 }
6141                 backlog_unlock_irq_enable(sd);
6142         }
6143 
6144         if (work)
6145                 rps_input_queue_head_add(sd, work);
6146         return work;
6147 }
6148 
6149 /**
6150  * __napi_schedule - schedule for receive
6151  * @n: entry to schedule
6152  *
6153  * The entry's receive function will be scheduled to run.
6154  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6155  */
6156 void __napi_schedule(struct napi_struct *n)
6157 {
6158         unsigned long flags;
6159 
6160         local_irq_save(flags);
6161         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6162         local_irq_restore(flags);
6163 }
6164 EXPORT_SYMBOL(__napi_schedule);
6165 
6166 /**
6167  *      napi_schedule_prep - check if napi can be scheduled
6168  *      @n: napi context
6169  *
6170  * Test if NAPI routine is already running, and if not mark
6171  * it as running.  This is used as a condition variable to
6172  * insure only one NAPI poll instance runs.  We also make
6173  * sure there is no pending NAPI disable.
6174  */
6175 bool napi_schedule_prep(struct napi_struct *n)
6176 {
6177         unsigned long new, val = READ_ONCE(n->state);
6178 
6179         do {
6180                 if (unlikely(val & NAPIF_STATE_DISABLE))
6181                         return false;
6182                 new = val | NAPIF_STATE_SCHED;
6183 
6184                 /* Sets STATE_MISSED bit if STATE_SCHED was already set
6185                  * This was suggested by Alexander Duyck, as compiler
6186                  * emits better code than :
6187                  * if (val & NAPIF_STATE_SCHED)
6188                  *     new |= NAPIF_STATE_MISSED;
6189                  */
6190                 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6191                                                    NAPIF_STATE_MISSED;
6192         } while (!try_cmpxchg(&n->state, &val, new));
6193 
6194         return !(val & NAPIF_STATE_SCHED);
6195 }
6196 EXPORT_SYMBOL(napi_schedule_prep);
6197 
6198 /**
6199  * __napi_schedule_irqoff - schedule for receive
6200  * @n: entry to schedule
6201  *
6202  * Variant of __napi_schedule() assuming hard irqs are masked.
6203  *
6204  * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
6205  * because the interrupt disabled assumption might not be true
6206  * due to force-threaded interrupts and spinlock substitution.
6207  */
6208 void __napi_schedule_irqoff(struct napi_struct *n)
6209 {
6210         if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6211                 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6212         else
6213                 __napi_schedule(n);
6214 }
6215 EXPORT_SYMBOL(__napi_schedule_irqoff);
6216 
6217 bool napi_complete_done(struct napi_struct *n, int work_done)
6218 {
6219         unsigned long flags, val, new, timeout = 0;
6220         bool ret = true;
6221 
6222         /*
6223          * 1) Don't let napi dequeue from the cpu poll list
6224          *    just in case its running on a different cpu.
6225          * 2) If we are busy polling, do nothing here, we have
6226          *    the guarantee we will be called later.
6227          */
6228         if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6229                                  NAPIF_STATE_IN_BUSY_POLL)))
6230                 return false;
6231 
6232         if (work_done) {
6233                 if (n->gro_bitmask)
6234                         timeout = READ_ONCE(n->dev->gro_flush_timeout);
6235                 n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6236         }
6237         if (n->defer_hard_irqs_count > 0) {
6238                 n->defer_hard_irqs_count--;
6239                 timeout = READ_ONCE(n->dev->gro_flush_timeout);
6240                 if (timeout)
6241                         ret = false;
6242         }
6243         if (n->gro_bitmask) {
6244                 /* When the NAPI instance uses a timeout and keeps postponing
6245                  * it, we need to bound somehow the time packets are kept in
6246                  * the GRO layer
6247                  */
6248                 napi_gro_flush(n, !!timeout);
6249         }
6250 
6251         gro_normal_list(n);
6252 
6253         if (unlikely(!list_empty(&n->poll_list))) {
6254                 /* If n->poll_list is not empty, we need to mask irqs */
6255                 local_irq_save(flags);
6256                 list_del_init(&n->poll_list);
6257                 local_irq_restore(flags);
6258         }
6259         WRITE_ONCE(n->list_owner, -1);
6260 
6261         val = READ_ONCE(n->state);
6262         do {
6263                 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6264 
6265                 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
6266                               NAPIF_STATE_SCHED_THREADED |
6267                               NAPIF_STATE_PREFER_BUSY_POLL);
6268 
6269                 /* If STATE_MISSED was set, leave STATE_SCHED set,
6270                  * because we will call napi->poll() one more time.
6271                  * This C code was suggested by Alexander Duyck to help gcc.
6272                  */
6273                 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6274                                                     NAPIF_STATE_SCHED;
6275         } while (!try_cmpxchg(&n->state, &val, new));
6276 
6277         if (unlikely(val & NAPIF_STATE_MISSED)) {
6278                 __napi_schedule(n);
6279                 return false;
6280         }
6281 
6282         if (timeout)
6283                 hrtimer_start(&n->timer, ns_to_ktime(timeout),
6284                               HRTIMER_MODE_REL_PINNED);
6285         return ret;
6286 }
6287 EXPORT_SYMBOL(napi_complete_done);
6288 
6289 /* must be called under rcu_read_lock(), as we dont take a reference */
6290 struct napi_struct *napi_by_id(unsigned int napi_id)
6291 {
6292         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6293         struct napi_struct *napi;
6294 
6295         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6296                 if (napi->napi_id == napi_id)
6297                         return napi;
6298 
6299         return NULL;
6300 }
6301 
6302 static void skb_defer_free_flush(struct softnet_data *sd)
6303 {
6304         struct sk_buff *skb, *next;
6305 
6306         /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
6307         if (!READ_ONCE(sd->defer_list))
6308                 return;
6309 
6310         spin_lock(&sd->defer_lock);
6311         skb = sd->defer_list;
6312         sd->defer_list = NULL;
6313         sd->defer_count = 0;
6314         spin_unlock(&sd->defer_lock);
6315 
6316         while (skb != NULL) {
6317                 next = skb->next;
6318                 napi_consume_skb(skb, 1);
6319                 skb = next;
6320         }
6321 }
6322 
6323 #if defined(CONFIG_NET_RX_BUSY_POLL)
6324 
6325 static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
6326 {
6327         if (!skip_schedule) {
6328                 gro_normal_list(napi);
6329                 __napi_schedule(napi);
6330                 return;
6331         }
6332 
6333         if (napi->gro_bitmask) {
6334                 /* flush too old packets
6335                  * If HZ < 1000, flush all packets.
6336                  */
6337                 napi_gro_flush(napi, HZ >= 1000);
6338         }
6339 
6340         gro_normal_list(napi);
6341         clear_bit(NAPI_STATE_SCHED, &napi->state);
6342 }
6343 
6344 enum {
6345         NAPI_F_PREFER_BUSY_POLL = 1,
6346         NAPI_F_END_ON_RESCHED   = 2,
6347 };
6348 
6349 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
6350                            unsigned flags, u16 budget)
6351 {
6352         struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
6353         bool skip_schedule = false;
6354         unsigned long timeout;
6355         int rc;
6356 
6357         /* Busy polling means there is a high chance device driver hard irq
6358          * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6359          * set in napi_schedule_prep().
6360          * Since we are about to call napi->poll() once more, we can safely
6361          * clear NAPI_STATE_MISSED.
6362          *
6363          * Note: x86 could use a single "lock and ..." instruction
6364          * to perform these two clear_bit()
6365          */
6366         clear_bit(NAPI_STATE_MISSED, &napi->state);
6367         clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6368 
6369         local_bh_disable();
6370         bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
6371 
6372         if (flags & NAPI_F_PREFER_BUSY_POLL) {
6373                 napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
6374                 timeout = READ_ONCE(napi->dev->gro_flush_timeout);
6375                 if (napi->defer_hard_irqs_count && timeout) {
6376                         hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
6377                         skip_schedule = true;
6378                 }
6379         }
6380 
6381         /* All we really want here is to re-enable device interrupts.
6382          * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6383          */
6384         rc = napi->poll(napi, budget);
6385         /* We can't gro_normal_list() here, because napi->poll() might have
6386          * rearmed the napi (napi_complete_done()) in which case it could
6387          * already be running on another CPU.
6388          */
6389         trace_napi_poll(napi, rc, budget);
6390         netpoll_poll_unlock(have_poll_lock);
6391         if (rc == budget)
6392                 __busy_poll_stop(napi, skip_schedule);
6393         bpf_net_ctx_clear(bpf_net_ctx);
6394         local_bh_enable();
6395 }
6396 
6397 static void __napi_busy_loop(unsigned int napi_id,
6398                       bool (*loop_end)(void *, unsigned long),
6399                       void *loop_end_arg, unsigned flags, u16 budget)
6400 {
6401         unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6402         int (*napi_poll)(struct napi_struct *napi, int budget);
6403         struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
6404         void *have_poll_lock = NULL;
6405         struct napi_struct *napi;
6406 
6407         WARN_ON_ONCE(!rcu_read_lock_held());
6408 
6409 restart:
6410         napi_poll = NULL;
6411 
6412         napi = napi_by_id(napi_id);
6413         if (!napi)
6414                 return;
6415 
6416         if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6417                 preempt_disable();
6418         for (;;) {
6419                 int work = 0;
6420 
6421                 local_bh_disable();
6422                 bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
6423                 if (!napi_poll) {
6424                         unsigned long val = READ_ONCE(napi->state);
6425 
6426                         /* If multiple threads are competing for this napi,
6427                          * we avoid dirtying napi->state as much as we can.
6428                          */
6429                         if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6430                                    NAPIF_STATE_IN_BUSY_POLL)) {
6431                                 if (flags & NAPI_F_PREFER_BUSY_POLL)
6432                                         set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6433                                 goto count;
6434                         }
6435                         if (cmpxchg(&napi->state, val,
6436                                     val | NAPIF_STATE_IN_BUSY_POLL |
6437                                           NAPIF_STATE_SCHED) != val) {
6438                                 if (flags & NAPI_F_PREFER_BUSY_POLL)
6439                                         set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6440                                 goto count;
6441                         }
6442                         have_poll_lock = netpoll_poll_lock(napi);
6443                         napi_poll = napi->poll;
6444                 }
6445                 work = napi_poll(napi, budget);
6446                 trace_napi_poll(napi, work, budget);
6447                 gro_normal_list(napi);
6448 count:
6449                 if (work > 0)
6450                         __NET_ADD_STATS(dev_net(napi->dev),
6451                                         LINUX_MIB_BUSYPOLLRXPACKETS, work);
6452                 skb_defer_free_flush(this_cpu_ptr(&softnet_data));
6453                 bpf_net_ctx_clear(bpf_net_ctx);
6454                 local_bh_enable();
6455 
6456                 if (!loop_end || loop_end(loop_end_arg, start_time))
6457                         break;
6458 
6459                 if (unlikely(need_resched())) {
6460                         if (flags & NAPI_F_END_ON_RESCHED)
6461                                 break;
6462                         if (napi_poll)
6463                                 busy_poll_stop(napi, have_poll_lock, flags, budget);
6464                         if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6465                                 preempt_enable();
6466                         rcu_read_unlock();
6467                         cond_resched();
6468                         rcu_read_lock();
6469                         if (loop_end(loop_end_arg, start_time))
6470                                 return;
6471                         goto restart;
6472                 }
6473                 cpu_relax();
6474         }
6475         if (napi_poll)
6476                 busy_poll_stop(napi, have_poll_lock, flags, budget);
6477         if (!IS_ENABLED(CONFIG_PREEMPT_RT))
6478                 preempt_enable();
6479 }
6480 
6481 void napi_busy_loop_rcu(unsigned int napi_id,
6482                         bool (*loop_end)(void *, unsigned long),
6483                         void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6484 {
6485         unsigned flags = NAPI_F_END_ON_RESCHED;
6486 
6487         if (prefer_busy_poll)
6488                 flags |= NAPI_F_PREFER_BUSY_POLL;
6489 
6490         __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
6491 }
6492 
6493 void napi_busy_loop(unsigned int napi_id,
6494                     bool (*loop_end)(void *, unsigned long),
6495                     void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6496 {
6497         unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
6498 
6499         rcu_read_lock();
6500         __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
6501         rcu_read_unlock();
6502 }
6503 EXPORT_SYMBOL(napi_busy_loop);
6504 
6505 #endif /* CONFIG_NET_RX_BUSY_POLL */
6506 
6507 static void napi_hash_add(struct napi_struct *napi)
6508 {
6509         if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
6510                 return;
6511 
6512         spin_lock(&napi_hash_lock);
6513 
6514         /* 0..NR_CPUS range is reserved for sender_cpu use */
6515         do {
6516                 if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6517                         napi_gen_id = MIN_NAPI_ID;
6518         } while (napi_by_id(napi_gen_id));
6519         napi->napi_id = napi_gen_id;
6520 
6521         hlist_add_head_rcu(&napi->napi_hash_node,
6522                            &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6523 
6524         spin_unlock(&napi_hash_lock);
6525 }
6526 
6527 /* Warning : caller is responsible to make sure rcu grace period
6528  * is respected before freeing memory containing @napi
6529  */
6530 static void napi_hash_del(struct napi_struct *napi)
6531 {
6532         spin_lock(&napi_hash_lock);
6533 
6534         hlist_del_init_rcu(&napi->napi_hash_node);
6535 
6536         spin_unlock(&napi_hash_lock);
6537 }
6538 
6539 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6540 {
6541         struct napi_struct *napi;
6542 
6543         napi = container_of(timer, struct napi_struct, timer);
6544 
6545         /* Note : we use a relaxed variant of napi_schedule_prep() not setting
6546          * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6547          */
6548         if (!napi_disable_pending(napi) &&
6549             !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
6550                 clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6551                 __napi_schedule_irqoff(napi);
6552         }
6553 
6554         return HRTIMER_NORESTART;
6555 }
6556 
6557 static void init_gro_hash(struct napi_struct *napi)
6558 {
6559         int i;
6560 
6561         for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6562                 INIT_LIST_HEAD(&napi->gro_hash[i].list);
6563                 napi->gro_hash[i].count = 0;
6564         }
6565         napi->gro_bitmask = 0;
6566 }
6567 
6568 int dev_set_threaded(struct net_device *dev, bool threaded)
6569 {
6570         struct napi_struct *napi;
6571         int err = 0;
6572 
6573         if (dev->threaded == threaded)
6574                 return 0;
6575 
6576         if (threaded) {
6577                 list_for_each_entry(napi, &dev->napi_list, dev_list) {
6578                         if (!napi->thread) {
6579                                 err = napi_kthread_create(napi);
6580                                 if (err) {
6581                                         threaded = false;
6582                                         break;
6583                                 }
6584                         }
6585                 }
6586         }
6587 
6588         WRITE_ONCE(dev->threaded, threaded);
6589 
6590         /* Make sure kthread is created before THREADED bit
6591          * is set.
6592          */
6593         smp_mb__before_atomic();
6594 
6595         /* Setting/unsetting threaded mode on a napi might not immediately
6596          * take effect, if the current napi instance is actively being
6597          * polled. In this case, the switch between threaded mode and
6598          * softirq mode will happen in the next round of napi_schedule().
6599          * This should not cause hiccups/stalls to the live traffic.
6600          */
6601         list_for_each_entry(napi, &dev->napi_list, dev_list)
6602                 assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
6603 
6604         return err;
6605 }
6606 EXPORT_SYMBOL(dev_set_threaded);
6607 
6608 /**
6609  * netif_queue_set_napi - Associate queue with the napi
6610  * @dev: device to which NAPI and queue belong
6611  * @queue_index: Index of queue
6612  * @type: queue type as RX or TX
6613  * @napi: NAPI context, pass NULL to clear previously set NAPI
6614  *
6615  * Set queue with its corresponding napi context. This should be done after
6616  * registering the NAPI handler for the queue-vector and the queues have been
6617  * mapped to the corresponding interrupt vector.
6618  */
6619 void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
6620                           enum netdev_queue_type type, struct napi_struct *napi)
6621 {
6622         struct netdev_rx_queue *rxq;
6623         struct netdev_queue *txq;
6624 
6625         if (WARN_ON_ONCE(napi && !napi->dev))
6626                 return;
6627         if (dev->reg_state >= NETREG_REGISTERED)
6628                 ASSERT_RTNL();
6629 
6630         switch (type) {
6631         case NETDEV_QUEUE_TYPE_RX:
6632                 rxq = __netif_get_rx_queue(dev, queue_index);
6633                 rxq->napi = napi;
6634                 return;
6635         case NETDEV_QUEUE_TYPE_TX:
6636                 txq = netdev_get_tx_queue(dev, queue_index);
6637                 txq->napi = napi;
6638                 return;
6639         default:
6640                 return;
6641         }
6642 }
6643 EXPORT_SYMBOL(netif_queue_set_napi);
6644 
6645 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
6646                            int (*poll)(struct napi_struct *, int), int weight)
6647 {
6648         if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
6649                 return;
6650 
6651         INIT_LIST_HEAD(&napi->poll_list);
6652         INIT_HLIST_NODE(&napi->napi_hash_node);
6653         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6654         napi->timer.function = napi_watchdog;
6655         init_gro_hash(napi);
6656         napi->skb = NULL;
6657         INIT_LIST_HEAD(&napi->rx_list);
6658         napi->rx_count = 0;
6659         napi->poll = poll;
6660         if (weight > NAPI_POLL_WEIGHT)
6661                 netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6662                                 weight);
6663         napi->weight = weight;
6664         napi->dev = dev;
6665 #ifdef CONFIG_NETPOLL
6666         napi->poll_owner = -1;
6667 #endif
6668         napi->list_owner = -1;
6669         set_bit(NAPI_STATE_SCHED, &napi->state);
6670         set_bit(NAPI_STATE_NPSVC, &napi->state);
6671         list_add_rcu(&napi->dev_list, &dev->napi_list);
6672         napi_hash_add(napi);
6673         napi_get_frags_check(napi);
6674         /* Create kthread for this napi if dev->threaded is set.
6675          * Clear dev->threaded if kthread creation failed so that
6676          * threaded mode will not be enabled in napi_enable().
6677          */
6678         if (dev->threaded && napi_kthread_create(napi))
6679                 dev->threaded = false;
6680         netif_napi_set_irq(napi, -1);
6681 }
6682 EXPORT_SYMBOL(netif_napi_add_weight);
6683 
6684 void napi_disable(struct napi_struct *n)
6685 {
6686         unsigned long val, new;
6687 
6688         might_sleep();
6689         set_bit(NAPI_STATE_DISABLE, &n->state);
6690 
6691         val = READ_ONCE(n->state);
6692         do {
6693                 while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
6694                         usleep_range(20, 200);
6695                         val = READ_ONCE(n->state);
6696                 }
6697 
6698                 new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
6699                 new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
6700         } while (!try_cmpxchg(&n->state, &val, new));
6701 
6702         hrtimer_cancel(&n->timer);
6703 
6704         clear_bit(NAPI_STATE_DISABLE, &n->state);
6705 }
6706 EXPORT_SYMBOL(napi_disable);
6707 
6708 /**
6709  *      napi_enable - enable NAPI scheduling
6710  *      @n: NAPI context
6711  *
6712  * Resume NAPI from being scheduled on this context.
6713  * Must be paired with napi_disable.
6714  */
6715 void napi_enable(struct napi_struct *n)
6716 {
6717         unsigned long new, val = READ_ONCE(n->state);
6718 
6719         do {
6720                 BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
6721 
6722                 new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
6723                 if (n->dev->threaded && n->thread)
6724                         new |= NAPIF_STATE_THREADED;
6725         } while (!try_cmpxchg(&n->state, &val, new));
6726 }
6727 EXPORT_SYMBOL(napi_enable);
6728 
6729 static void flush_gro_hash(struct napi_struct *napi)
6730 {
6731         int i;
6732 
6733         for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6734                 struct sk_buff *skb, *n;
6735 
6736                 list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6737                         kfree_skb(skb);
6738                 napi->gro_hash[i].count = 0;
6739         }
6740 }
6741 
6742 /* Must be called in process context */
6743 void __netif_napi_del(struct napi_struct *napi)
6744 {
6745         if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
6746                 return;
6747 
6748         napi_hash_del(napi);
6749         list_del_rcu(&napi->dev_list);
6750         napi_free_frags(napi);
6751 
6752         flush_gro_hash(napi);
6753         napi->gro_bitmask = 0;
6754 
6755         if (napi->thread) {
6756                 kthread_stop(napi->thread);
6757                 napi->thread = NULL;
6758         }
6759 }
6760 EXPORT_SYMBOL(__netif_napi_del);
6761 
6762 static int __napi_poll(struct napi_struct *n, bool *repoll)
6763 {
6764         int work, weight;
6765 
6766         weight = n->weight;
6767 
6768         /* This NAPI_STATE_SCHED test is for avoiding a race
6769          * with netpoll's poll_napi().  Only the entity which
6770          * obtains the lock and sees NAPI_STATE_SCHED set will
6771          * actually make the ->poll() call.  Therefore we avoid
6772          * accidentally calling ->poll() when NAPI is not scheduled.
6773          */
6774         work = 0;
6775         if (napi_is_scheduled(n)) {
6776                 work = n->poll(n, weight);
6777                 trace_napi_poll(n, work, weight);
6778 
6779                 xdp_do_check_flushed(n);
6780         }
6781 
6782         if (unlikely(work > weight))
6783                 netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
6784                                 n->poll, work, weight);
6785 
6786         if (likely(work < weight))
6787                 return work;
6788 
6789         /* Drivers must not modify the NAPI state if they
6790          * consume the entire weight.  In such cases this code
6791          * still "owns" the NAPI instance and therefore can
6792          * move the instance around on the list at-will.
6793          */
6794         if (unlikely(napi_disable_pending(n))) {
6795                 napi_complete(n);
6796                 return work;
6797         }
6798 
6799         /* The NAPI context has more processing work, but busy-polling
6800          * is preferred. Exit early.
6801          */
6802         if (napi_prefer_busy_poll(n)) {
6803                 if (napi_complete_done(n, work)) {
6804                         /* If timeout is not set, we need to make sure
6805                          * that the NAPI is re-scheduled.
6806                          */
6807                         napi_schedule(n);
6808                 }
6809                 return work;
6810         }
6811 
6812         if (n->gro_bitmask) {
6813                 /* flush too old packets
6814                  * If HZ < 1000, flush all packets.
6815                  */
6816                 napi_gro_flush(n, HZ >= 1000);
6817         }
6818 
6819         gro_normal_list(n);
6820 
6821         /* Some drivers may have called napi_schedule
6822          * prior to exhausting their budget.
6823          */
6824         if (unlikely(!list_empty(&n->poll_list))) {
6825                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6826                              n->dev ? n->dev->name : "backlog");
6827                 return work;
6828         }
6829 
6830         *repoll = true;
6831 
6832         return work;
6833 }
6834 
6835 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6836 {
6837         bool do_repoll = false;
6838         void *have;
6839         int work;
6840 
6841         list_del_init(&n->poll_list);
6842 
6843         have = netpoll_poll_lock(n);
6844 
6845         work = __napi_poll(n, &do_repoll);
6846 
6847         if (do_repoll)
6848                 list_add_tail(&n->poll_list, repoll);
6849 
6850         netpoll_poll_unlock(have);
6851 
6852         return work;
6853 }
6854 
6855 static int napi_thread_wait(struct napi_struct *napi)
6856 {
6857         set_current_state(TASK_INTERRUPTIBLE);
6858 
6859         while (!kthread_should_stop()) {
6860                 /* Testing SCHED_THREADED bit here to make sure the current
6861                  * kthread owns this napi and could poll on this napi.
6862                  * Testing SCHED bit is not enough because SCHED bit might be
6863                  * set by some other busy poll thread or by napi_disable().
6864                  */
6865                 if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
6866                         WARN_ON(!list_empty(&napi->poll_list));
6867                         __set_current_state(TASK_RUNNING);
6868                         return 0;
6869                 }
6870 
6871                 schedule();
6872                 set_current_state(TASK_INTERRUPTIBLE);
6873         }
6874         __set_current_state(TASK_RUNNING);
6875 
6876         return -1;
6877 }
6878 
6879 static void napi_threaded_poll_loop(struct napi_struct *napi)
6880 {
6881         struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
6882         struct softnet_data *sd;
6883         unsigned long last_qs = jiffies;
6884 
6885         for (;;) {
6886                 bool repoll = false;
6887                 void *have;
6888 
6889                 local_bh_disable();
6890                 bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
6891 
6892                 sd = this_cpu_ptr(&softnet_data);
6893                 sd->in_napi_threaded_poll = true;
6894 
6895                 have = netpoll_poll_lock(napi);
6896                 __napi_poll(napi, &repoll);
6897                 netpoll_poll_unlock(have);
6898 
6899                 sd->in_napi_threaded_poll = false;
6900                 barrier();
6901 
6902                 if (sd_has_rps_ipi_waiting(sd)) {
6903                         local_irq_disable();
6904                         net_rps_action_and_irq_enable(sd);
6905                 }
6906                 skb_defer_free_flush(sd);
6907                 bpf_net_ctx_clear(bpf_net_ctx);
6908                 local_bh_enable();
6909 
6910                 if (!repoll)
6911                         break;
6912 
6913                 rcu_softirq_qs_periodic(last_qs);
6914                 cond_resched();
6915         }
6916 }
6917 
6918 static int napi_threaded_poll(void *data)
6919 {
6920         struct napi_struct *napi = data;
6921 
6922         while (!napi_thread_wait(napi))
6923                 napi_threaded_poll_loop(napi);
6924 
6925         return 0;
6926 }
6927 
6928 static __latent_entropy void net_rx_action(struct softirq_action *h)
6929 {
6930         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6931         unsigned long time_limit = jiffies +
6932                 usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
6933         struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
6934         int budget = READ_ONCE(net_hotdata.netdev_budget);
6935         LIST_HEAD(list);
6936         LIST_HEAD(repoll);
6937 
6938         bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
6939 start:
6940         sd->in_net_rx_action = true;
6941         local_irq_disable();
6942         list_splice_init(&sd->poll_list, &list);
6943         local_irq_enable();
6944 
6945         for (;;) {
6946                 struct napi_struct *n;
6947 
6948                 skb_defer_free_flush(sd);
6949 
6950                 if (list_empty(&list)) {
6951                         if (list_empty(&repoll)) {
6952                                 sd->in_net_rx_action = false;
6953                                 barrier();
6954                                 /* We need to check if ____napi_schedule()
6955                                  * had refilled poll_list while
6956                                  * sd->in_net_rx_action was true.
6957                                  */
6958                                 if (!list_empty(&sd->poll_list))
6959                                         goto start;
6960                                 if (!sd_has_rps_ipi_waiting(sd))
6961                                         goto end;
6962                         }
6963                         break;
6964                 }
6965 
6966                 n = list_first_entry(&list, struct napi_struct, poll_list);
6967                 budget -= napi_poll(n, &repoll);
6968 
6969                 /* If softirq window is exhausted then punt.
6970                  * Allow this to run for 2 jiffies since which will allow
6971                  * an average latency of 1.5/HZ.
6972                  */
6973                 if (unlikely(budget <= 0 ||
6974                              time_after_eq(jiffies, time_limit))) {
6975                         sd->time_squeeze++;
6976                         break;
6977                 }
6978         }
6979 
6980         local_irq_disable();
6981 
6982         list_splice_tail_init(&sd->poll_list, &list);
6983         list_splice_tail(&repoll, &list);
6984         list_splice(&list, &sd->poll_list);
6985         if (!list_empty(&sd->poll_list))
6986                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
6987         else
6988                 sd->in_net_rx_action = false;
6989 
6990         net_rps_action_and_irq_enable(sd);
6991 end:
6992         bpf_net_ctx_clear(bpf_net_ctx);
6993 }
6994 
6995 struct netdev_adjacent {
6996         struct net_device *dev;
6997         netdevice_tracker dev_tracker;
6998 
6999         /* upper master flag, there can only be one master device per list */
7000         bool master;
7001 
7002         /* lookup ignore flag */
7003         bool ignore;
7004 
7005         /* counter for the number of times this device was added to us */
7006         u16 ref_nr;
7007 
7008         /* private field for the users */
7009         void *private;
7010 
7011         struct list_head list;
7012         struct rcu_head rcu;
7013 };
7014 
7015 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
7016                                                  struct list_head *adj_list)
7017 {
7018         struct netdev_adjacent *adj;
7019 
7020         list_for_each_entry(adj, adj_list, list) {
7021                 if (adj->dev == adj_dev)
7022                         return adj;
7023         }
7024         return NULL;
7025 }
7026 
7027 static int ____netdev_has_upper_dev(struct net_device *upper_dev,
7028                                     struct netdev_nested_priv *priv)
7029 {
7030         struct net_device *dev = (struct net_device *)priv->data;
7031 
7032         return upper_dev == dev;
7033 }
7034 
7035 /**
7036  * netdev_has_upper_dev - Check if device is linked to an upper device
7037  * @dev: device
7038  * @upper_dev: upper device to check
7039  *
7040  * Find out if a device is linked to specified upper device and return true
7041  * in case it is. Note that this checks only immediate upper device,
7042  * not through a complete stack of devices. The caller must hold the RTNL lock.
7043  */
7044 bool netdev_has_upper_dev(struct net_device *dev,
7045                           struct net_device *upper_dev)
7046 {
7047         struct netdev_nested_priv priv = {
7048                 .data = (void *)upper_dev,
7049         };
7050 
7051         ASSERT_RTNL();
7052 
7053         return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
7054                                              &priv);
7055 }
7056 EXPORT_SYMBOL(netdev_has_upper_dev);
7057 
7058 /**
7059  * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
7060  * @dev: device
7061  * @upper_dev: upper device to check
7062  *
7063  * Find out if a device is linked to specified upper device and return true
7064  * in case it is. Note that this checks the entire upper device chain.
7065  * The caller must hold rcu lock.
7066  */
7067 
7068 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
7069                                   struct net_device *upper_dev)
7070 {
7071         struct netdev_nested_priv priv = {
7072                 .data = (void *)upper_dev,
7073         };
7074 
7075         return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
7076                                                &priv);
7077 }
7078 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
7079 
7080 /**
7081  * netdev_has_any_upper_dev - Check if device is linked to some device
7082  * @dev: device
7083  *
7084  * Find out if a device is linked to an upper device and return true in case
7085  * it is. The caller must hold the RTNL lock.
7086  */
7087 bool netdev_has_any_upper_dev(struct net_device *dev)
7088 {
7089         ASSERT_RTNL();
7090 
7091         return !list_empty(&dev->adj_list.upper);
7092 }
7093 EXPORT_SYMBOL(netdev_has_any_upper_dev);
7094 
7095 /**
7096  * netdev_master_upper_dev_get - Get master upper device
7097  * @dev: device
7098  *
7099  * Find a master upper device and return pointer to it or NULL in case
7100  * it's not there. The caller must hold the RTNL lock.
7101  */
7102 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
7103 {
7104         struct netdev_adjacent *upper;
7105 
7106         ASSERT_RTNL();
7107 
7108         if (list_empty(&dev->adj_list.upper))
7109                 return NULL;
7110 
7111         upper = list_first_entry(&dev->adj_list.upper,
7112                                  struct netdev_adjacent, list);
7113         if (likely(upper->master))
7114                 return upper->dev;
7115         return NULL;
7116 }
7117 EXPORT_SYMBOL(netdev_master_upper_dev_get);
7118 
7119 static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
7120 {
7121         struct netdev_adjacent *upper;
7122 
7123         ASSERT_RTNL();
7124 
7125         if (list_empty(&dev->adj_list.upper))
7126                 return NULL;
7127 
7128         upper = list_first_entry(&dev->adj_list.upper,
7129                                  struct netdev_adjacent, list);
7130         if (likely(upper->master) && !upper->ignore)
7131                 return upper->dev;
7132         return NULL;
7133 }
7134 
7135 /**
7136  * netdev_has_any_lower_dev - Check if device is linked to some device
7137  * @dev: device
7138  *
7139  * Find out if a device is linked to a lower device and return true in case
7140  * it is. The caller must hold the RTNL lock.
7141  */
7142 static bool netdev_has_any_lower_dev(struct net_device *dev)
7143 {
7144         ASSERT_RTNL();
7145 
7146         return !list_empty(&dev->adj_list.lower);
7147 }
7148 
7149 void *netdev_adjacent_get_private(struct list_head *adj_list)
7150 {
7151         struct netdev_adjacent *adj;
7152 
7153         adj = list_entry(adj_list, struct netdev_adjacent, list);
7154 
7155         return adj->private;
7156 }
7157 EXPORT_SYMBOL(netdev_adjacent_get_private);
7158 
7159 /**
7160  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
7161  * @dev: device
7162  * @iter: list_head ** of the current position
7163  *
7164  * Gets the next device from the dev's upper list, starting from iter
7165  * position. The caller must hold RCU read lock.
7166  */
7167 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
7168                                                  struct list_head **iter)
7169 {
7170         struct netdev_adjacent *upper;
7171 
7172         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7173 
7174         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7175 
7176         if (&upper->list == &dev->adj_list.upper)
7177                 return NULL;
7178 
7179         *iter = &upper->list;
7180 
7181         return upper->dev;
7182 }
7183 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
7184 
7185 static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
7186                                                   struct list_head **iter,
7187                                                   bool *ignore)
7188 {
7189         struct netdev_adjacent *upper;
7190 
7191         upper = list_entry((*iter)->next, struct netdev_adjacent, list);
7192 
7193         if (&upper->list == &dev->adj_list.upper)
7194                 return NULL;
7195 
7196         *iter = &upper->list;
7197         *ignore = upper->ignore;
7198 
7199         return upper->dev;
7200 }
7201 
7202 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
7203                                                     struct list_head **iter)
7204 {
7205         struct netdev_adjacent *upper;
7206 
7207         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7208 
7209         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7210 
7211         if (&upper->list == &dev->adj_list.upper)
7212                 return NULL;
7213 
7214         *iter = &upper->list;
7215 
7216         return upper->dev;
7217 }
7218 
7219 static int __netdev_walk_all_upper_dev(struct net_device *dev,
7220                                        int (*fn)(struct net_device *dev,
7221                                          struct netdev_nested_priv *priv),
7222                                        struct netdev_nested_priv *priv)
7223 {
7224         struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7225         struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7226         int ret, cur = 0;
7227         bool ignore;
7228 
7229         now = dev;
7230         iter = &dev->adj_list.upper;
7231 
7232         while (1) {
7233                 if (now != dev) {
7234                         ret = fn(now, priv);
7235                         if (ret)
7236                                 return ret;
7237                 }
7238 
7239                 next = NULL;
7240                 while (1) {
7241                         udev = __netdev_next_upper_dev(now, &iter, &ignore);
7242                         if (!udev)
7243                                 break;
7244                         if (ignore)
7245                                 continue;
7246 
7247                         next = udev;
7248                         niter = &udev->adj_list.upper;
7249                         dev_stack[cur] = now;
7250                         iter_stack[cur++] = iter;
7251                         break;
7252                 }
7253 
7254                 if (!next) {
7255                         if (!cur)
7256                                 return 0;
7257                         next = dev_stack[--cur];
7258                         niter = iter_stack[cur];
7259                 }
7260 
7261                 now = next;
7262                 iter = niter;
7263         }
7264 
7265         return 0;
7266 }
7267 
7268 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
7269                                   int (*fn)(struct net_device *dev,
7270                                             struct netdev_nested_priv *priv),
7271                                   struct netdev_nested_priv *priv)
7272 {
7273         struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7274         struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7275         int ret, cur = 0;
7276 
7277         now = dev;
7278         iter = &dev->adj_list.upper;
7279 
7280         while (1) {
7281                 if (now != dev) {
7282                         ret = fn(now, priv);
7283                         if (ret)
7284                                 return ret;
7285                 }
7286 
7287                 next = NULL;
7288                 while (1) {
7289                         udev = netdev_next_upper_dev_rcu(now, &iter);
7290                         if (!udev)
7291                                 break;
7292 
7293                         next = udev;
7294                         niter = &udev->adj_list.upper;
7295                         dev_stack[cur] = now;
7296                         iter_stack[cur++] = iter;
7297                         break;
7298                 }
7299 
7300                 if (!next) {
7301                         if (!cur)
7302                                 return 0;
7303                         next = dev_stack[--cur];
7304                         niter = iter_stack[cur];
7305                 }
7306 
7307                 now = next;
7308                 iter = niter;
7309         }
7310 
7311         return 0;
7312 }
7313 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
7314 
7315 static bool __netdev_has_upper_dev(struct net_device *dev,
7316                                    struct net_device *upper_dev)
7317 {
7318         struct netdev_nested_priv priv = {
7319                 .flags = 0,
7320                 .data = (void *)upper_dev,
7321         };
7322 
7323         ASSERT_RTNL();
7324 
7325         return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7326                                            &priv);
7327 }
7328 
7329 /**
7330  * netdev_lower_get_next_private - Get the next ->private from the
7331  *                                 lower neighbour list
7332  * @dev: device
7333  * @iter: list_head ** of the current position
7334  *
7335  * Gets the next netdev_adjacent->private from the dev's lower neighbour
7336  * list, starting from iter position. The caller must hold either hold the
7337  * RTNL lock or its own locking that guarantees that the neighbour lower
7338  * list will remain unchanged.
7339  */
7340 void *netdev_lower_get_next_private(struct net_device *dev,
7341                                     struct list_head **iter)
7342 {
7343         struct netdev_adjacent *lower;
7344 
7345         lower = list_entry(*iter, struct netdev_adjacent, list);
7346 
7347         if (&lower->list == &dev->adj_list.lower)
7348                 return NULL;
7349 
7350         *iter = lower->list.next;
7351 
7352         return lower->private;
7353 }
7354 EXPORT_SYMBOL(netdev_lower_get_next_private);
7355 
7356 /**
7357  * netdev_lower_get_next_private_rcu - Get the next ->private from the
7358  *                                     lower neighbour list, RCU
7359  *                                     variant
7360  * @dev: device
7361  * @iter: list_head ** of the current position
7362  *
7363  * Gets the next netdev_adjacent->private from the dev's lower neighbour
7364  * list, starting from iter position. The caller must hold RCU read lock.
7365  */
7366 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7367                                         struct list_head **iter)
7368 {
7369         struct netdev_adjacent *lower;
7370 
7371         WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
7372 
7373         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7374 
7375         if (&lower->list == &dev->adj_list.lower)
7376                 return NULL;
7377 
7378         *iter = &lower->list;
7379 
7380         return lower->private;
7381 }
7382 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7383 
7384 /**
7385  * netdev_lower_get_next - Get the next device from the lower neighbour
7386  *                         list
7387  * @dev: device
7388  * @iter: list_head ** of the current position
7389  *
7390  * Gets the next netdev_adjacent from the dev's lower neighbour
7391  * list, starting from iter position. The caller must hold RTNL lock or
7392  * its own locking that guarantees that the neighbour lower
7393  * list will remain unchanged.
7394  */
7395 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7396 {
7397         struct netdev_adjacent *lower;
7398 
7399         lower = list_entry(*iter, struct netdev_adjacent, list);
7400 
7401         if (&lower->list == &dev->adj_list.lower)
7402                 return NULL;
7403 
7404         *iter = lower->list.next;
7405 
7406         return lower->dev;
7407 }
7408 EXPORT_SYMBOL(netdev_lower_get_next);
7409 
7410 static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7411                                                 struct list_head **iter)
7412 {
7413         struct netdev_adjacent *lower;
7414 
7415         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7416 
7417         if (&lower->list == &dev->adj_list.lower)
7418                 return NULL;
7419 
7420         *iter = &lower->list;
7421 
7422         return lower->dev;
7423 }
7424 
7425 static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7426                                                   struct list_head **iter,
7427                                                   bool *ignore)
7428 {
7429         struct netdev_adjacent *lower;
7430 
7431         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7432 
7433         if (&lower->list == &dev->adj_list.lower)
7434                 return NULL;
7435 
7436         *iter = &lower->list;
7437         *ignore = lower->ignore;
7438 
7439         return lower->dev;
7440 }
7441 
7442 int netdev_walk_all_lower_dev(struct net_device *dev,
7443                               int (*fn)(struct net_device *dev,
7444                                         struct netdev_nested_priv *priv),
7445                               struct netdev_nested_priv *priv)
7446 {
7447         struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7448         struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7449         int ret, cur = 0;
7450 
7451         now = dev;
7452         iter = &dev->adj_list.lower;
7453 
7454         while (1) {
7455                 if (now != dev) {
7456                         ret = fn(now, priv);
7457                         if (ret)
7458                                 return ret;
7459                 }
7460 
7461                 next = NULL;
7462                 while (1) {
7463                         ldev = netdev_next_lower_dev(now, &iter);
7464                         if (!ldev)
7465                                 break;
7466 
7467                         next = ldev;
7468                         niter = &ldev->adj_list.lower;
7469                         dev_stack[cur] = now;
7470                         iter_stack[cur++] = iter;
7471                         break;
7472                 }
7473 
7474                 if (!next) {
7475                         if (!cur)
7476                                 return 0;
7477                         next = dev_stack[--cur];
7478                         niter = iter_stack[cur];
7479                 }
7480 
7481                 now = next;
7482                 iter = niter;
7483         }
7484 
7485         return 0;
7486 }
7487 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
7488 
7489 static int __netdev_walk_all_lower_dev(struct net_device *dev,
7490                                        int (*fn)(struct net_device *dev,
7491                                          struct netdev_nested_priv *priv),
7492                                        struct netdev_nested_priv *priv)
7493 {
7494         struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7495         struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7496         int ret, cur = 0;
7497         bool ignore;
7498 
7499         now = dev;
7500         iter = &dev->adj_list.lower;
7501 
7502         while (1) {
7503                 if (now != dev) {
7504                         ret = fn(now, priv);
7505                         if (ret)
7506                                 return ret;
7507                 }
7508 
7509                 next = NULL;
7510                 while (1) {
7511                         ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7512                         if (!ldev)
7513                                 break;
7514                         if (ignore)
7515                                 continue;
7516 
7517                         next = ldev;
7518                         niter = &ldev->adj_list.lower;
7519                         dev_stack[cur] = now;
7520                         iter_stack[cur++] = iter;
7521                         break;
7522                 }
7523 
7524                 if (!next) {
7525                         if (!cur)
7526                                 return 0;
7527                         next = dev_stack[--cur];
7528                         niter = iter_stack[cur];
7529                 }
7530 
7531                 now = next;
7532                 iter = niter;
7533         }
7534 
7535         return 0;
7536 }
7537 
7538 struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7539                                              struct list_head **iter)
7540 {
7541         struct netdev_adjacent *lower;
7542 
7543         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7544         if (&lower->list == &dev->adj_list.lower)
7545                 return NULL;
7546 
7547         *iter = &lower->list;
7548 
7549         return lower->dev;
7550 }
7551 EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7552 
7553 static u8 __netdev_upper_depth(struct net_device *dev)
7554 {
7555         struct net_device *udev;
7556         struct list_head *iter;
7557         u8 max_depth = 0;
7558         bool ignore;
7559 
7560         for (iter = &dev->adj_list.upper,
7561              udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7562              udev;
7563              udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7564                 if (ignore)
7565                         continue;
7566                 if (max_depth < udev->upper_level)
7567                         max_depth = udev->upper_level;
7568         }
7569 
7570         return max_depth;
7571 }
7572 
7573 static u8 __netdev_lower_depth(struct net_device *dev)
7574 {
7575         struct net_device *ldev;
7576         struct list_head *iter;
7577         u8 max_depth = 0;
7578         bool ignore;
7579 
7580         for (iter = &dev->adj_list.lower,
7581              ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7582              ldev;
7583              ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7584                 if (ignore)
7585                         continue;
7586                 if (max_depth < ldev->lower_level)
7587                         max_depth = ldev->lower_level;
7588         }
7589 
7590         return max_depth;
7591 }
7592 
7593 static int __netdev_update_upper_level(struct net_device *dev,
7594                                        struct netdev_nested_priv *__unused)
7595 {
7596         dev->upper_level = __netdev_upper_depth(dev) + 1;
7597         return 0;
7598 }
7599 
7600 #ifdef CONFIG_LOCKDEP
7601 static LIST_HEAD(net_unlink_list);
7602 
7603 static void net_unlink_todo(struct net_device *dev)
7604 {
7605         if (list_empty(&dev->unlink_list))
7606                 list_add_tail(&dev->unlink_list, &net_unlink_list);
7607 }
7608 #endif
7609 
7610 static int __netdev_update_lower_level(struct net_device *dev,
7611                                        struct netdev_nested_priv *priv)
7612 {
7613         dev->lower_level = __netdev_lower_depth(dev) + 1;
7614 
7615 #ifdef CONFIG_LOCKDEP
7616         if (!priv)
7617                 return 0;
7618 
7619         if (priv->flags & NESTED_SYNC_IMM)
7620                 dev->nested_level = dev->lower_level - 1;
7621         if (priv->flags & NESTED_SYNC_TODO)
7622                 net_unlink_todo(dev);
7623 #endif
7624         return 0;
7625 }
7626 
7627 int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7628                                   int (*fn)(struct net_device *dev,
7629                                             struct netdev_nested_priv *priv),
7630                                   struct netdev_nested_priv *priv)
7631 {
7632         struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7633         struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7634         int ret, cur = 0;
7635 
7636         now = dev;
7637         iter = &dev->adj_list.lower;
7638 
7639         while (1) {
7640                 if (now != dev) {
7641                         ret = fn(now, priv);
7642                         if (ret)
7643                                 return ret;
7644                 }
7645 
7646                 next = NULL;
7647                 while (1) {
7648                         ldev = netdev_next_lower_dev_rcu(now, &iter);
7649                         if (!ldev)
7650                                 break;
7651 
7652                         next = ldev;
7653                         niter = &ldev->adj_list.lower;
7654                         dev_stack[cur] = now;
7655                         iter_stack[cur++] = iter;
7656                         break;
7657                 }
7658 
7659                 if (!next) {
7660                         if (!cur)
7661                                 return 0;
7662                         next = dev_stack[--cur];
7663                         niter = iter_stack[cur];
7664                 }
7665 
7666                 now = next;
7667                 iter = niter;
7668         }
7669 
7670         return 0;
7671 }
7672 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7673 
7674 /**
7675  * netdev_lower_get_first_private_rcu - Get the first ->private from the
7676  *                                     lower neighbour list, RCU
7677  *                                     variant
7678  * @dev: device
7679  *
7680  * Gets the first netdev_adjacent->private from the dev's lower neighbour
7681  * list. The caller must hold RCU read lock.
7682  */
7683 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7684 {
7685         struct netdev_adjacent *lower;
7686 
7687         lower = list_first_or_null_rcu(&dev->adj_list.lower,
7688                         struct netdev_adjacent, list);
7689         if (lower)
7690                 return lower->private;
7691         return NULL;
7692 }
7693 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7694 
7695 /**
7696  * netdev_master_upper_dev_get_rcu - Get master upper device
7697  * @dev: device
7698  *
7699  * Find a master upper device and return pointer to it or NULL in case
7700  * it's not there. The caller must hold the RCU read lock.
7701  */
7702 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7703 {
7704         struct netdev_adjacent *upper;
7705 
7706         upper = list_first_or_null_rcu(&dev->adj_list.upper,
7707                                        struct netdev_adjacent, list);
7708         if (upper && likely(upper->master))
7709                 return upper->dev;
7710         return NULL;
7711 }
7712 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7713 
7714 static int netdev_adjacent_sysfs_add(struct net_device *dev,
7715                               struct net_device *adj_dev,
7716                               struct list_head *dev_list)
7717 {
7718         char linkname[IFNAMSIZ+7];
7719 
7720         sprintf(linkname, dev_list == &dev->adj_list.upper ?
7721                 "upper_%s" : "lower_%s", adj_dev->name);
7722         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7723                                  linkname);
7724 }
7725 static void netdev_adjacent_sysfs_del(struct net_device *dev,
7726                                char *name,
7727                                struct list_head *dev_list)
7728 {
7729         char linkname[IFNAMSIZ+7];
7730 
7731         sprintf(linkname, dev_list == &dev->adj_list.upper ?
7732                 "upper_%s" : "lower_%s", name);
7733         sysfs_remove_link(&(dev->dev.kobj), linkname);
7734 }
7735 
7736 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7737                                                  struct net_device *adj_dev,
7738                                                  struct list_head *dev_list)
7739 {
7740         return (dev_list == &dev->adj_list.upper ||
7741                 dev_list == &dev->adj_list.lower) &&
7742                 net_eq(dev_net(dev), dev_net(adj_dev));
7743 }
7744 
7745 static int __netdev_adjacent_dev_insert(struct net_device *dev,
7746                                         struct net_device *adj_dev,
7747                                         struct list_head *dev_list,
7748                                         void *private, bool master)
7749 {
7750         struct netdev_adjacent *adj;
7751         int ret;
7752 
7753         adj = __netdev_find_adj(adj_dev, dev_list);
7754 
7755         if (adj) {
7756                 adj->ref_nr += 1;
7757                 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7758                          dev->name, adj_dev->name, adj->ref_nr);
7759 
7760                 return 0;
7761         }
7762 
7763         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7764         if (!adj)
7765                 return -ENOMEM;
7766 
7767         adj->dev = adj_dev;
7768         adj->master = master;
7769         adj->ref_nr = 1;
7770         adj->private = private;
7771         adj->ignore = false;
7772         netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
7773 
7774         pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7775                  dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7776 
7777         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7778                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7779                 if (ret)
7780                         goto free_adj;
7781         }
7782 
7783         /* Ensure that master link is always the first item in list. */
7784         if (master) {
7785                 ret = sysfs_create_link(&(dev->dev.kobj),
7786                                         &(adj_dev->dev.kobj), "master");
7787                 if (ret)
7788                         goto remove_symlinks;
7789 
7790                 list_add_rcu(&adj->list, dev_list);
7791         } else {
7792                 list_add_tail_rcu(&adj->list, dev_list);
7793         }
7794 
7795         return 0;
7796 
7797 remove_symlinks:
7798         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7799                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7800 free_adj:
7801         netdev_put(adj_dev, &adj->dev_tracker);
7802         kfree(adj);
7803 
7804         return ret;
7805 }
7806 
7807 static void __netdev_adjacent_dev_remove(struct net_device *dev,
7808                                          struct net_device *adj_dev,
7809                                          u16 ref_nr,
7810                                          struct list_head *dev_list)
7811 {
7812         struct netdev_adjacent *adj;
7813 
7814         pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7815                  dev->name, adj_dev->name, ref_nr);
7816 
7817         adj = __netdev_find_adj(adj_dev, dev_list);
7818 
7819         if (!adj) {
7820                 pr_err("Adjacency does not exist for device %s from %s\n",
7821                        dev->name, adj_dev->name);
7822                 WARN_ON(1);
7823                 return;
7824         }
7825 
7826         if (adj->ref_nr > ref_nr) {
7827                 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7828                          dev->name, adj_dev->name, ref_nr,
7829                          adj->ref_nr - ref_nr);
7830                 adj->ref_nr -= ref_nr;
7831                 return;
7832         }
7833 
7834         if (adj->master)
7835                 sysfs_remove_link(&(dev->dev.kobj), "master");
7836 
7837         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7838                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7839 
7840         list_del_rcu(&adj->list);
7841         pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7842                  adj_dev->name, dev->name, adj_dev->name);
7843         netdev_put(adj_dev, &adj->dev_tracker);
7844         kfree_rcu(adj, rcu);
7845 }
7846 
7847 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7848                                             struct net_device *upper_dev,
7849                                             struct list_head *up_list,
7850                                             struct list_head *down_list,
7851                                             void *private, bool master)
7852 {
7853         int ret;
7854 
7855         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7856                                            private, master);
7857         if (ret)
7858                 return ret;
7859 
7860         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7861                                            private, false);
7862         if (ret) {
7863                 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7864                 return ret;
7865         }
7866 
7867         return 0;
7868 }
7869 
7870 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7871                                                struct net_device *upper_dev,
7872                                                u16 ref_nr,
7873                                                struct list_head *up_list,
7874                                                struct list_head *down_list)
7875 {
7876         __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7877         __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7878 }
7879 
7880 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7881                                                 struct net_device *upper_dev,
7882                                                 void *private, bool master)
7883 {
7884         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7885                                                 &dev->adj_list.upper,
7886                                                 &upper_dev->adj_list.lower,
7887                                                 private, master);
7888 }
7889 
7890 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7891                                                    struct net_device *upper_dev)
7892 {
7893         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7894                                            &dev->adj_list.upper,
7895                                            &upper_dev->adj_list.lower);
7896 }
7897 
7898 static int __netdev_upper_dev_link(struct net_device *dev,
7899                                    struct net_device *upper_dev, bool master,
7900                                    void *upper_priv, void *upper_info,
7901                                    struct netdev_nested_priv *priv,
7902                                    struct netlink_ext_ack *extack)
7903 {
7904         struct netdev_notifier_changeupper_info changeupper_info = {
7905                 .info = {
7906                         .dev = dev,
7907                         .extack = extack,
7908                 },
7909                 .upper_dev = upper_dev,
7910                 .master = master,
7911                 .linking = true,
7912                 .upper_info = upper_info,
7913         };
7914         struct net_device *master_dev;
7915         int ret = 0;
7916 
7917         ASSERT_RTNL();
7918 
7919         if (dev == upper_dev)
7920                 return -EBUSY;
7921 
7922         /* To prevent loops, check if dev is not upper device to upper_dev. */
7923         if (__netdev_has_upper_dev(upper_dev, dev))
7924                 return -EBUSY;
7925 
7926         if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7927                 return -EMLINK;
7928 
7929         if (!master) {
7930                 if (__netdev_has_upper_dev(dev, upper_dev))
7931                         return -EEXIST;
7932         } else {
7933                 master_dev = __netdev_master_upper_dev_get(dev);
7934                 if (master_dev)
7935                         return master_dev == upper_dev ? -EEXIST : -EBUSY;
7936         }
7937 
7938         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7939                                             &changeupper_info.info);
7940         ret = notifier_to_errno(ret);
7941         if (ret)
7942                 return ret;
7943 
7944         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7945                                                    master);
7946         if (ret)
7947                 return ret;
7948 
7949         ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
7950                                             &changeupper_info.info);
7951         ret = notifier_to_errno(ret);
7952         if (ret)
7953                 goto rollback;
7954 
7955         __netdev_update_upper_level(dev, NULL);
7956         __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
7957 
7958         __netdev_update_lower_level(upper_dev, priv);
7959         __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
7960                                     priv);
7961 
7962         return 0;
7963 
7964 rollback:
7965         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
7966 
7967         return ret;
7968 }
7969 
7970 /**
7971  * netdev_upper_dev_link - Add a link to the upper device
7972  * @dev: device
7973  * @upper_dev: new upper device
7974  * @extack: netlink extended ack
7975  *
7976  * Adds a link to device which is upper to this one. The caller must hold
7977  * the RTNL lock. On a failure a negative errno code is returned.
7978  * On success the reference counts are adjusted and the function
7979  * returns zero.
7980  */
7981 int netdev_upper_dev_link(struct net_device *dev,
7982                           struct net_device *upper_dev,
7983                           struct netlink_ext_ack *extack)
7984 {
7985         struct netdev_nested_priv priv = {
7986                 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
7987                 .data = NULL,
7988         };
7989 
7990         return __netdev_upper_dev_link(dev, upper_dev, false,
7991                                        NULL, NULL, &priv, extack);
7992 }
7993 EXPORT_SYMBOL(netdev_upper_dev_link);
7994 
7995 /**
7996  * netdev_master_upper_dev_link - Add a master link to the upper device
7997  * @dev: device
7998  * @upper_dev: new upper device
7999  * @upper_priv: upper device private
8000  * @upper_info: upper info to be passed down via notifier
8001  * @extack: netlink extended ack
8002  *
8003  * Adds a link to device which is upper to this one. In this case, only
8004  * one master upper device can be linked, although other non-master devices
8005  * might be linked as well. The caller must hold the RTNL lock.
8006  * On a failure a negative errno code is returned. On success the reference
8007  * counts are adjusted and the function returns zero.
8008  */
8009 int netdev_master_upper_dev_link(struct net_device *dev,
8010                                  struct net_device *upper_dev,
8011                                  void *upper_priv, void *upper_info,
8012                                  struct netlink_ext_ack *extack)
8013 {
8014         struct netdev_nested_priv priv = {
8015                 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8016                 .data = NULL,
8017         };
8018 
8019         return __netdev_upper_dev_link(dev, upper_dev, true,
8020                                        upper_priv, upper_info, &priv, extack);
8021 }
8022 EXPORT_SYMBOL(netdev_master_upper_dev_link);
8023 
8024 static void __netdev_upper_dev_unlink(struct net_device *dev,
8025                                       struct net_device *upper_dev,
8026                                       struct netdev_nested_priv *priv)
8027 {
8028         struct netdev_notifier_changeupper_info changeupper_info = {
8029                 .info = {
8030                         .dev = dev,
8031                 },
8032                 .upper_dev = upper_dev,
8033                 .linking = false,
8034         };
8035 
8036         ASSERT_RTNL();
8037 
8038         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
8039 
8040         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
8041                                       &changeupper_info.info);
8042 
8043         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
8044 
8045         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
8046                                       &changeupper_info.info);
8047 
8048         __netdev_update_upper_level(dev, NULL);
8049         __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
8050 
8051         __netdev_update_lower_level(upper_dev, priv);
8052         __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
8053                                     priv);
8054 }
8055 
8056 /**
8057  * netdev_upper_dev_unlink - Removes a link to upper device
8058  * @dev: device
8059  * @upper_dev: new upper device
8060  *
8061  * Removes a link to device which is upper to this one. The caller must hold
8062  * the RTNL lock.
8063  */
8064 void netdev_upper_dev_unlink(struct net_device *dev,
8065                              struct net_device *upper_dev)
8066 {
8067         struct netdev_nested_priv priv = {
8068                 .flags = NESTED_SYNC_TODO,
8069                 .data = NULL,
8070         };
8071 
8072         __netdev_upper_dev_unlink(dev, upper_dev, &priv);
8073 }
8074 EXPORT_SYMBOL(netdev_upper_dev_unlink);
8075 
8076 static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
8077                                       struct net_device *lower_dev,
8078                                       bool val)
8079 {
8080         struct netdev_adjacent *adj;
8081 
8082         adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
8083         if (adj)
8084                 adj->ignore = val;
8085 
8086         adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
8087         if (adj)
8088                 adj->ignore = val;
8089 }
8090 
8091 static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
8092                                         struct net_device *lower_dev)
8093 {
8094         __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
8095 }
8096 
8097 static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
8098                                        struct net_device *lower_dev)
8099 {
8100         __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
8101 }
8102 
8103 int netdev_adjacent_change_prepare(struct net_device *old_dev,
8104                                    struct net_device *new_dev,
8105                                    struct net_device *dev,
8106                                    struct netlink_ext_ack *extack)
8107 {
8108         struct netdev_nested_priv priv = {
8109                 .flags = 0,
8110                 .data = NULL,
8111         };
8112         int err;
8113 
8114         if (!new_dev)
8115                 return 0;
8116 
8117         if (old_dev && new_dev != old_dev)
8118                 netdev_adjacent_dev_disable(dev, old_dev);
8119         err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
8120                                       extack);
8121         if (err) {
8122                 if (old_dev && new_dev != old_dev)
8123                         netdev_adjacent_dev_enable(dev, old_dev);
8124                 return err;
8125         }
8126 
8127         return 0;
8128 }
8129 EXPORT_SYMBOL(netdev_adjacent_change_prepare);
8130 
8131 void netdev_adjacent_change_commit(struct net_device *old_dev,
8132                                    struct net_device *new_dev,
8133                                    struct net_device *dev)
8134 {
8135         struct netdev_nested_priv priv = {
8136                 .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8137                 .data = NULL,
8138         };
8139 
8140         if (!new_dev || !old_dev)
8141                 return;
8142 
8143         if (new_dev == old_dev)
8144                 return;
8145 
8146         netdev_adjacent_dev_enable(dev, old_dev);
8147         __netdev_upper_dev_unlink(old_dev, dev, &priv);
8148 }
8149 EXPORT_SYMBOL(netdev_adjacent_change_commit);
8150 
8151 void netdev_adjacent_change_abort(struct net_device *old_dev,
8152                                   struct net_device *new_dev,
8153                                   struct net_device *dev)
8154 {
8155         struct netdev_nested_priv priv = {
8156                 .flags = 0,
8157                 .data = NULL,
8158         };
8159 
8160         if (!new_dev)
8161                 return;
8162 
8163         if (old_dev && new_dev != old_dev)
8164                 netdev_adjacent_dev_enable(dev, old_dev);
8165 
8166         __netdev_upper_dev_unlink(new_dev, dev, &priv);
8167 }
8168 EXPORT_SYMBOL(netdev_adjacent_change_abort);
8169 
8170 /**
8171  * netdev_bonding_info_change - Dispatch event about slave change
8172  * @dev: device
8173  * @bonding_info: info to dispatch
8174  *
8175  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
8176  * The caller must hold the RTNL lock.
8177  */
8178 void netdev_bonding_info_change(struct net_device *dev,
8179                                 struct netdev_bonding_info *bonding_info)
8180 {
8181         struct netdev_notifier_bonding_info info = {
8182                 .info.dev = dev,
8183         };
8184 
8185         memcpy(&info.bonding_info, bonding_info,
8186                sizeof(struct netdev_bonding_info));
8187         call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
8188                                       &info.info);
8189 }
8190 EXPORT_SYMBOL(netdev_bonding_info_change);
8191 
8192 static int netdev_offload_xstats_enable_l3(struct net_device *dev,
8193                                            struct netlink_ext_ack *extack)
8194 {
8195         struct netdev_notifier_offload_xstats_info info = {
8196                 .info.dev = dev,
8197                 .info.extack = extack,
8198                 .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
8199         };
8200         int err;
8201         int rc;
8202 
8203         dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
8204                                          GFP_KERNEL);
8205         if (!dev->offload_xstats_l3)
8206                 return -ENOMEM;
8207 
8208         rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
8209                                                   NETDEV_OFFLOAD_XSTATS_DISABLE,
8210                                                   &info.info);
8211         err = notifier_to_errno(rc);
8212         if (err)
8213                 goto free_stats;
8214 
8215         return 0;
8216 
8217 free_stats:
8218         kfree(dev->offload_xstats_l3);
8219         dev->offload_xstats_l3 = NULL;
8220         return err;
8221 }
8222 
8223 int netdev_offload_xstats_enable(struct net_device *dev,
8224                                  enum netdev_offload_xstats_type type,
8225                                  struct netlink_ext_ack *extack)
8226 {
8227         ASSERT_RTNL();
8228 
8229         if (netdev_offload_xstats_enabled(dev, type))
8230                 return -EALREADY;
8231 
8232         switch (type) {
8233         case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8234                 return netdev_offload_xstats_enable_l3(dev, extack);
8235         }
8236 
8237         WARN_ON(1);
8238         return -EINVAL;
8239 }
8240 EXPORT_SYMBOL(netdev_offload_xstats_enable);
8241 
8242 static void netdev_offload_xstats_disable_l3(struct net_device *dev)
8243 {
8244         struct netdev_notifier_offload_xstats_info info = {
8245                 .info.dev = dev,
8246                 .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
8247         };
8248 
8249         call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
8250                                       &info.info);
8251         kfree(dev->offload_xstats_l3);
8252         dev->offload_xstats_l3 = NULL;
8253 }
8254 
8255 int netdev_offload_xstats_disable(struct net_device *dev,
8256                                   enum netdev_offload_xstats_type type)
8257 {
8258         ASSERT_RTNL();
8259 
8260         if (!netdev_offload_xstats_enabled(dev, type))
8261                 return -EALREADY;
8262 
8263         switch (type) {
8264         case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8265                 netdev_offload_xstats_disable_l3(dev);
8266                 return 0;
8267         }
8268 
8269         WARN_ON(1);
8270         return -EINVAL;
8271 }
8272 EXPORT_SYMBOL(netdev_offload_xstats_disable);
8273 
8274 static void netdev_offload_xstats_disable_all(struct net_device *dev)
8275 {
8276         netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
8277 }
8278 
8279 static struct rtnl_hw_stats64 *
8280 netdev_offload_xstats_get_ptr(const struct net_device *dev,
8281                               enum netdev_offload_xstats_type type)
8282 {
8283         switch (type) {
8284         case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
8285                 return dev->offload_xstats_l3;
8286         }
8287 
8288         WARN_ON(1);
8289         return NULL;
8290 }
8291 
8292 bool netdev_offload_xstats_enabled(const struct net_device *dev,
8293                                    enum netdev_offload_xstats_type type)
8294 {
8295         ASSERT_RTNL();
8296 
8297         return netdev_offload_xstats_get_ptr(dev, type);
8298 }
8299 EXPORT_SYMBOL(netdev_offload_xstats_enabled);
8300 
8301 struct netdev_notifier_offload_xstats_ru {
8302         bool used;
8303 };
8304 
8305 struct netdev_notifier_offload_xstats_rd {
8306         struct rtnl_hw_stats64 stats;
8307         bool used;
8308 };
8309 
8310 static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
8311                                   const struct rtnl_hw_stats64 *src)
8312 {
8313         dest->rx_packets          += src->rx_packets;
8314         dest->tx_packets          += src->tx_packets;
8315         dest->rx_bytes            += src->rx_bytes;
8316         dest->tx_bytes            += src->tx_bytes;
8317         dest->rx_errors           += src->rx_errors;
8318         dest->tx_errors           += src->tx_errors;
8319         dest->rx_dropped          += src->rx_dropped;
8320         dest->tx_dropped          += src->tx_dropped;
8321         dest->multicast           += src->multicast;
8322 }
8323 
8324 static int netdev_offload_xstats_get_used(struct net_device *dev,
8325                                           enum netdev_offload_xstats_type type,
8326                                           bool *p_used,
8327                                           struct netlink_ext_ack *extack)
8328 {
8329         struct netdev_notifier_offload_xstats_ru report_used = {};
8330         struct netdev_notifier_offload_xstats_info info = {
8331                 .info.dev = dev,
8332                 .info.extack = extack,
8333                 .type = type,
8334                 .report_used = &report_used,
8335         };
8336         int rc;
8337 
8338         WARN_ON(!netdev_offload_xstats_enabled(dev, type));
8339         rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
8340                                            &info.info);
8341         *p_used = report_used.used;
8342         return notifier_to_errno(rc);
8343 }
8344 
8345 static int netdev_offload_xstats_get_stats(struct net_device *dev,
8346                                            enum netdev_offload_xstats_type type,
8347                                            struct rtnl_hw_stats64 *p_stats,
8348                                            bool *p_used,
8349                                            struct netlink_ext_ack *extack)
8350 {
8351         struct netdev_notifier_offload_xstats_rd report_delta = {};
8352         struct netdev_notifier_offload_xstats_info info = {
8353                 .info.dev = dev,
8354                 .info.extack = extack,
8355                 .type = type,
8356                 .report_delta = &report_delta,
8357         };
8358         struct rtnl_hw_stats64 *stats;
8359         int rc;
8360 
8361         stats = netdev_offload_xstats_get_ptr(dev, type);
8362         if (WARN_ON(!stats))
8363                 return -EINVAL;
8364 
8365         rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
8366                                            &info.info);
8367 
8368         /* Cache whatever we got, even if there was an error, otherwise the
8369          * successful stats retrievals would get lost.
8370          */
8371         netdev_hw_stats64_add(stats, &report_delta.stats);
8372 
8373         if (p_stats)
8374                 *p_stats = *stats;
8375         *p_used = report_delta.used;
8376 
8377         return notifier_to_errno(rc);
8378 }
8379 
8380 int netdev_offload_xstats_get(struct net_device *dev,
8381                               enum netdev_offload_xstats_type type,
8382                               struct rtnl_hw_stats64 *p_stats, bool *p_used,
8383                               struct netlink_ext_ack *extack)
8384 {
8385         ASSERT_RTNL();
8386 
8387         if (p_stats)
8388                 return netdev_offload_xstats_get_stats(dev, type, p_stats,
8389                                                        p_used, extack);
8390         else
8391                 return netdev_offload_xstats_get_used(dev, type, p_used,
8392                                                       extack);
8393 }
8394 EXPORT_SYMBOL(netdev_offload_xstats_get);
8395 
8396 void
8397 netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
8398                                    const struct rtnl_hw_stats64 *stats)
8399 {
8400         report_delta->used = true;
8401         netdev_hw_stats64_add(&report_delta->stats, stats);
8402 }
8403 EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
8404 
8405 void
8406 netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
8407 {
8408         report_used->used = true;
8409 }
8410 EXPORT_SYMBOL(netdev_offload_xstats_report_used);
8411 
8412 void netdev_offload_xstats_push_delta(struct net_device *dev,
8413                                       enum netdev_offload_xstats_type type,
8414                                       const struct rtnl_hw_stats64 *p_stats)
8415 {
8416         struct rtnl_hw_stats64 *stats;
8417 
8418         ASSERT_RTNL();
8419 
8420         stats = netdev_offload_xstats_get_ptr(dev, type);
8421         if (WARN_ON(!stats))
8422                 return;
8423 
8424         netdev_hw_stats64_add(stats, p_stats);
8425 }
8426 EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
8427 
8428 /**
8429  * netdev_get_xmit_slave - Get the xmit slave of master device
8430  * @dev: device
8431  * @skb: The packet
8432  * @all_slaves: assume all the slaves are active
8433  *
8434  * The reference counters are not incremented so the caller must be
8435  * careful with locks. The caller must hold RCU lock.
8436  * %NULL is returned if no slave is found.
8437  */
8438 
8439 struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8440                                          struct sk_buff *skb,
8441                                          bool all_slaves)
8442 {
8443         const struct net_device_ops *ops = dev->netdev_ops;
8444 
8445         if (!ops->ndo_get_xmit_slave)
8446                 return NULL;
8447         return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8448 }
8449 EXPORT_SYMBOL(netdev_get_xmit_slave);
8450 
8451 static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
8452                                                   struct sock *sk)
8453 {
8454         const struct net_device_ops *ops = dev->netdev_ops;
8455 
8456         if (!ops->ndo_sk_get_lower_dev)
8457                 return NULL;
8458         return ops->ndo_sk_get_lower_dev(dev, sk);
8459 }
8460 
8461 /**
8462  * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
8463  * @dev: device
8464  * @sk: the socket
8465  *
8466  * %NULL is returned if no lower device is found.
8467  */
8468 
8469 struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
8470                                             struct sock *sk)
8471 {
8472         struct net_device *lower;
8473 
8474         lower = netdev_sk_get_lower_dev(dev, sk);
8475         while (lower) {
8476                 dev = lower;
8477                 lower = netdev_sk_get_lower_dev(dev, sk);
8478         }
8479 
8480         return dev;
8481 }
8482 EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
8483 
8484 static void netdev_adjacent_add_links(struct net_device *dev)
8485 {
8486         struct netdev_adjacent *iter;
8487 
8488         struct net *net = dev_net(dev);
8489 
8490         list_for_each_entry(iter, &dev->adj_list.upper, list) {
8491                 if (!net_eq(net, dev_net(iter->dev)))
8492                         continue;
8493                 netdev_adjacent_sysfs_add(iter->dev, dev,
8494                                           &iter->dev->adj_list.lower);
8495                 netdev_adjacent_sysfs_add(dev, iter->dev,
8496                                           &dev->adj_list.upper);
8497         }
8498 
8499         list_for_each_entry(iter, &dev->adj_list.lower, list) {
8500                 if (!net_eq(net, dev_net(iter->dev)))
8501                         continue;
8502                 netdev_adjacent_sysfs_add(iter->dev, dev,
8503                                           &iter->dev->adj_list.upper);
8504                 netdev_adjacent_sysfs_add(dev, iter->dev,
8505                                           &dev->adj_list.lower);
8506         }
8507 }
8508 
8509 static void netdev_adjacent_del_links(struct net_device *dev)
8510 {
8511         struct netdev_adjacent *iter;
8512 
8513         struct net *net = dev_net(dev);
8514 
8515         list_for_each_entry(iter, &dev->adj_list.upper, list) {
8516                 if (!net_eq(net, dev_net(iter->dev)))
8517                         continue;
8518                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8519                                           &iter->dev->adj_list.lower);
8520                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8521                                           &dev->adj_list.upper);
8522         }
8523 
8524         list_for_each_entry(iter, &dev->adj_list.lower, list) {
8525                 if (!net_eq(net, dev_net(iter->dev)))
8526                         continue;
8527                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
8528                                           &iter->dev->adj_list.upper);
8529                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
8530                                           &dev->adj_list.lower);
8531         }
8532 }
8533 
8534 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
8535 {
8536         struct netdev_adjacent *iter;
8537 
8538         struct net *net = dev_net(dev);
8539 
8540         list_for_each_entry(iter, &dev->adj_list.upper, list) {
8541                 if (!net_eq(net, dev_net(iter->dev)))
8542                         continue;
8543                 netdev_adjacent_sysfs_del(iter->dev, oldname,
8544                                           &iter->dev->adj_list.lower);
8545                 netdev_adjacent_sysfs_add(iter->dev, dev,
8546                                           &iter->dev->adj_list.lower);
8547         }
8548 
8549         list_for_each_entry(iter, &dev->adj_list.lower, list) {
8550                 if (!net_eq(net, dev_net(iter->dev)))
8551                         continue;
8552                 netdev_adjacent_sysfs_del(iter->dev, oldname,
8553                                           &iter->dev->adj_list.upper);
8554                 netdev_adjacent_sysfs_add(iter->dev, dev,
8555                                           &iter->dev->adj_list.upper);
8556         }
8557 }
8558 
8559 void *netdev_lower_dev_get_private(struct net_device *dev,
8560                                    struct net_device *lower_dev)
8561 {
8562         struct netdev_adjacent *lower;
8563 
8564         if (!lower_dev)
8565                 return NULL;
8566         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8567         if (!lower)
8568                 return NULL;
8569 
8570         return lower->private;
8571 }
8572 EXPORT_SYMBOL(netdev_lower_dev_get_private);
8573 
8574 
8575 /**
8576  * netdev_lower_state_changed - Dispatch event about lower device state change
8577  * @lower_dev: device
8578  * @lower_state_info: state to dispatch
8579  *
8580  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8581  * The caller must hold the RTNL lock.
8582  */
8583 void netdev_lower_state_changed(struct net_device *lower_dev,
8584                                 void *lower_state_info)
8585 {
8586         struct netdev_notifier_changelowerstate_info changelowerstate_info = {
8587                 .info.dev = lower_dev,
8588         };
8589 
8590         ASSERT_RTNL();
8591         changelowerstate_info.lower_state_info = lower_state_info;
8592         call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8593                                       &changelowerstate_info.info);
8594 }
8595 EXPORT_SYMBOL(netdev_lower_state_changed);
8596 
8597 static void dev_change_rx_flags(struct net_device *dev, int flags)
8598 {
8599         const struct net_device_ops *ops = dev->netdev_ops;
8600 
8601         if (ops->ndo_change_rx_flags)
8602                 ops->ndo_change_rx_flags(dev, flags);
8603 }
8604 
8605 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
8606 {
8607         unsigned int old_flags = dev->flags;
8608         unsigned int promiscuity, flags;
8609         kuid_t uid;
8610         kgid_t gid;
8611 
8612         ASSERT_RTNL();
8613 
8614         promiscuity = dev->promiscuity + inc;
8615         if (promiscuity == 0) {
8616                 /*
8617                  * Avoid overflow.
8618                  * If inc causes overflow, untouch promisc and return error.
8619                  */
8620                 if (unlikely(inc > 0)) {
8621                         netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
8622                         return -EOVERFLOW;
8623                 }
8624                 flags = old_flags & ~IFF_PROMISC;
8625         } else {
8626                 flags = old_flags | IFF_PROMISC;
8627         }
8628         WRITE_ONCE(dev->promiscuity, promiscuity);
8629         if (flags != old_flags) {
8630                 WRITE_ONCE(dev->flags, flags);
8631                 netdev_info(dev, "%s promiscuous mode\n",
8632                             dev->flags & IFF_PROMISC ? "entered" : "left");
8633                 if (audit_enabled) {
8634                         current_uid_gid(&uid, &gid);
8635                         audit_log(audit_context(), GFP_ATOMIC,
8636                                   AUDIT_ANOM_PROMISCUOUS,
8637                                   "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
8638                                   dev->name, (dev->flags & IFF_PROMISC),
8639                                   (old_flags & IFF_PROMISC),
8640                                   from_kuid(&init_user_ns, audit_get_loginuid(current)),
8641                                   from_kuid(&init_user_ns, uid),
8642                                   from_kgid(&init_user_ns, gid),
8643                                   audit_get_sessionid(current));
8644                 }
8645 
8646                 dev_change_rx_flags(dev, IFF_PROMISC);
8647         }
8648         if (notify)
8649                 __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
8650         return 0;
8651 }
8652 
8653 /**
8654  *      dev_set_promiscuity     - update promiscuity count on a device
8655  *      @dev: device
8656  *      @inc: modifier
8657  *
8658  *      Add or remove promiscuity from a device. While the count in the device
8659  *      remains above zero the interface remains promiscuous. Once it hits zero
8660  *      the device reverts back to normal filtering operation. A negative inc
8661  *      value is used to drop promiscuity on the device.
8662  *      Return 0 if successful or a negative errno code on error.
8663  */
8664 int dev_set_promiscuity(struct net_device *dev, int inc)
8665 {
8666         unsigned int old_flags = dev->flags;
8667         int err;
8668 
8669         err = __dev_set_promiscuity(dev, inc, true);
8670         if (err < 0)
8671                 return err;
8672         if (dev->flags != old_flags)
8673                 dev_set_rx_mode(dev);
8674         return err;
8675 }
8676 EXPORT_SYMBOL(dev_set_promiscuity);
8677 
8678 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
8679 {
8680         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
8681         unsigned int allmulti, flags;
8682 
8683         ASSERT_RTNL();
8684 
8685         allmulti = dev->allmulti + inc;
8686         if (allmulti == 0) {
8687                 /*
8688                  * Avoid overflow.
8689                  * If inc causes overflow, untouch allmulti and return error.
8690                  */
8691                 if (unlikely(inc > 0)) {
8692                         netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
8693                         return -EOVERFLOW;
8694                 }
8695                 flags = old_flags & ~IFF_ALLMULTI;
8696         } else {
8697                 flags = old_flags | IFF_ALLMULTI;
8698         }
8699         WRITE_ONCE(dev->allmulti, allmulti);
8700         if (flags != old_flags) {
8701                 WRITE_ONCE(dev->flags, flags);
8702                 netdev_info(dev, "%s allmulticast mode\n",
8703                             dev->flags & IFF_ALLMULTI ? "entered" : "left");
8704                 dev_change_rx_flags(dev, IFF_ALLMULTI);
8705                 dev_set_rx_mode(dev);
8706                 if (notify)
8707                         __dev_notify_flags(dev, old_flags,
8708                                            dev->gflags ^ old_gflags, 0, NULL);
8709         }
8710         return 0;
8711 }
8712 
8713 /**
8714  *      dev_set_allmulti        - update allmulti count on a device
8715  *      @dev: device
8716  *      @inc: modifier
8717  *
8718  *      Add or remove reception of all multicast frames to a device. While the
8719  *      count in the device remains above zero the interface remains listening
8720  *      to all interfaces. Once it hits zero the device reverts back to normal
8721  *      filtering operation. A negative @inc value is used to drop the counter
8722  *      when releasing a resource needing all multicasts.
8723  *      Return 0 if successful or a negative errno code on error.
8724  */
8725 
8726 int dev_set_allmulti(struct net_device *dev, int inc)
8727 {
8728         return __dev_set_allmulti(dev, inc, true);
8729 }
8730 EXPORT_SYMBOL(dev_set_allmulti);
8731 
8732 /*
8733  *      Upload unicast and multicast address lists to device and
8734  *      configure RX filtering. When the device doesn't support unicast
8735  *      filtering it is put in promiscuous mode while unicast addresses
8736  *      are present.
8737  */
8738 void __dev_set_rx_mode(struct net_device *dev)
8739 {
8740         const struct net_device_ops *ops = dev->netdev_ops;
8741 
8742         /* dev_open will call this function so the list will stay sane. */
8743         if (!(dev->flags&IFF_UP))
8744                 return;
8745 
8746         if (!netif_device_present(dev))
8747                 return;
8748 
8749         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8750                 /* Unicast addresses changes may only happen under the rtnl,
8751                  * therefore calling __dev_set_promiscuity here is safe.
8752                  */
8753                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8754                         __dev_set_promiscuity(dev, 1, false);
8755                         dev->uc_promisc = true;
8756                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8757                         __dev_set_promiscuity(dev, -1, false);
8758                         dev->uc_promisc = false;
8759                 }
8760         }
8761 
8762         if (ops->ndo_set_rx_mode)
8763                 ops->ndo_set_rx_mode(dev);
8764 }
8765 
8766 void dev_set_rx_mode(struct net_device *dev)
8767 {
8768         netif_addr_lock_bh(dev);
8769         __dev_set_rx_mode(dev);
8770         netif_addr_unlock_bh(dev);
8771 }
8772 
8773 /**
8774  *      dev_get_flags - get flags reported to userspace
8775  *      @dev: device
8776  *
8777  *      Get the combination of flag bits exported through APIs to userspace.
8778  */
8779 unsigned int dev_get_flags(const struct net_device *dev)
8780 {
8781         unsigned int flags;
8782 
8783         flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
8784                                 IFF_ALLMULTI |
8785                                 IFF_RUNNING |
8786                                 IFF_LOWER_UP |
8787                                 IFF_DORMANT)) |
8788                 (READ_ONCE(dev->gflags) & (IFF_PROMISC |
8789                                 IFF_ALLMULTI));
8790 
8791         if (netif_running(dev)) {
8792                 if (netif_oper_up(dev))
8793                         flags |= IFF_RUNNING;
8794                 if (netif_carrier_ok(dev))
8795                         flags |= IFF_LOWER_UP;
8796                 if (netif_dormant(dev))
8797                         flags |= IFF_DORMANT;
8798         }
8799 
8800         return flags;
8801 }
8802 EXPORT_SYMBOL(dev_get_flags);
8803 
8804 int __dev_change_flags(struct net_device *dev, unsigned int flags,
8805                        struct netlink_ext_ack *extack)
8806 {
8807         unsigned int old_flags = dev->flags;
8808         int ret;
8809 
8810         ASSERT_RTNL();
8811 
8812         /*
8813          *      Set the flags on our device.
8814          */
8815 
8816         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8817                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8818                                IFF_AUTOMEDIA)) |
8819                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8820                                     IFF_ALLMULTI));
8821 
8822         /*
8823          *      Load in the correct multicast list now the flags have changed.
8824          */
8825 
8826         if ((old_flags ^ flags) & IFF_MULTICAST)
8827                 dev_change_rx_flags(dev, IFF_MULTICAST);
8828 
8829         dev_set_rx_mode(dev);
8830 
8831         /*
8832          *      Have we downed the interface. We handle IFF_UP ourselves
8833          *      according to user attempts to set it, rather than blindly
8834          *      setting it.
8835          */
8836 
8837         ret = 0;
8838         if ((old_flags ^ flags) & IFF_UP) {
8839                 if (old_flags & IFF_UP)
8840                         __dev_close(dev);
8841                 else
8842                         ret = __dev_open(dev, extack);
8843         }
8844 
8845         if ((flags ^ dev->gflags) & IFF_PROMISC) {
8846                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
8847                 unsigned int old_flags = dev->flags;
8848 
8849                 dev->gflags ^= IFF_PROMISC;
8850 
8851                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
8852                         if (dev->flags != old_flags)
8853                                 dev_set_rx_mode(dev);
8854         }
8855 
8856         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8857          * is important. Some (broken) drivers set IFF_PROMISC, when
8858          * IFF_ALLMULTI is requested not asking us and not reporting.
8859          */
8860         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8861                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8862 
8863                 dev->gflags ^= IFF_ALLMULTI;
8864                 __dev_set_allmulti(dev, inc, false);
8865         }
8866 
8867         return ret;
8868 }
8869 
8870 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8871                         unsigned int gchanges, u32 portid,
8872                         const struct nlmsghdr *nlh)
8873 {
8874         unsigned int changes = dev->flags ^ old_flags;
8875 
8876         if (gchanges)
8877                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
8878 
8879         if (changes & IFF_UP) {
8880                 if (dev->flags & IFF_UP)
8881                         call_netdevice_notifiers(NETDEV_UP, dev);
8882                 else
8883                         call_netdevice_notifiers(NETDEV_DOWN, dev);
8884         }
8885 
8886         if (dev->flags & IFF_UP &&
8887             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8888                 struct netdev_notifier_change_info change_info = {
8889                         .info = {
8890                                 .dev = dev,
8891                         },
8892                         .flags_changed = changes,
8893                 };
8894 
8895                 call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8896         }
8897 }
8898 
8899 /**
8900  *      dev_change_flags - change device settings
8901  *      @dev: device
8902  *      @flags: device state flags
8903  *      @extack: netlink extended ack
8904  *
8905  *      Change settings on device based state flags. The flags are
8906  *      in the userspace exported format.
8907  */
8908 int dev_change_flags(struct net_device *dev, unsigned int flags,
8909                      struct netlink_ext_ack *extack)
8910 {
8911         int ret;
8912         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8913 
8914         ret = __dev_change_flags(dev, flags, extack);
8915         if (ret < 0)
8916                 return ret;
8917 
8918         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8919         __dev_notify_flags(dev, old_flags, changes, 0, NULL);
8920         return ret;
8921 }
8922 EXPORT_SYMBOL(dev_change_flags);
8923 
8924 int __dev_set_mtu(struct net_device *dev, int new_mtu)
8925 {
8926         const struct net_device_ops *ops = dev->netdev_ops;
8927 
8928         if (ops->ndo_change_mtu)
8929                 return ops->ndo_change_mtu(dev, new_mtu);
8930 
8931         /* Pairs with all the lockless reads of dev->mtu in the stack */
8932         WRITE_ONCE(dev->mtu, new_mtu);
8933         return 0;
8934 }
8935 EXPORT_SYMBOL(__dev_set_mtu);
8936 
8937 int dev_validate_mtu(struct net_device *dev, int new_mtu,
8938                      struct netlink_ext_ack *extack)
8939 {
8940         /* MTU must be positive, and in range */
8941         if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8942                 NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8943                 return -EINVAL;
8944         }
8945 
8946         if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8947                 NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8948                 return -EINVAL;
8949         }
8950         return 0;
8951 }
8952 
8953 /**
8954  *      dev_set_mtu_ext - Change maximum transfer unit
8955  *      @dev: device
8956  *      @new_mtu: new transfer unit
8957  *      @extack: netlink extended ack
8958  *
8959  *      Change the maximum transfer size of the network device.
8960  */
8961 int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8962                     struct netlink_ext_ack *extack)
8963 {
8964         int err, orig_mtu;
8965 
8966         if (new_mtu == dev->mtu)
8967                 return 0;
8968 
8969         err = dev_validate_mtu(dev, new_mtu, extack);
8970         if (err)
8971                 return err;
8972 
8973         if (!netif_device_present(dev))
8974                 return -ENODEV;
8975 
8976         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8977         err = notifier_to_errno(err);
8978         if (err)
8979                 return err;
8980 
8981         orig_mtu = dev->mtu;
8982         err = __dev_set_mtu(dev, new_mtu);
8983 
8984         if (!err) {
8985                 err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8986                                                    orig_mtu);
8987                 err = notifier_to_errno(err);
8988                 if (err) {
8989                         /* setting mtu back and notifying everyone again,
8990                          * so that they have a chance to revert changes.
8991                          */
8992                         __dev_set_mtu(dev, orig_mtu);
8993                         call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8994                                                      new_mtu);
8995                 }
8996         }
8997         return err;
8998 }
8999 
9000 int dev_set_mtu(struct net_device *dev, int new_mtu)
9001 {
9002         struct netlink_ext_ack extack;
9003         int err;
9004 
9005         memset(&extack, 0, sizeof(extack));
9006         err = dev_set_mtu_ext(dev, new_mtu, &extack);
9007         if (err && extack._msg)
9008                 net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
9009         return err;
9010 }
9011 EXPORT_SYMBOL(dev_set_mtu);
9012 
9013 /**
9014  *      dev_change_tx_queue_len - Change TX queue length of a netdevice
9015  *      @dev: device
9016  *      @new_len: new tx queue length
9017  */
9018 int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
9019 {
9020         unsigned int orig_len = dev->tx_queue_len;
9021         int res;
9022 
9023         if (new_len != (unsigned int)new_len)
9024                 return -ERANGE;
9025 
9026         if (new_len != orig_len) {
9027                 WRITE_ONCE(dev->tx_queue_len, new_len);
9028                 res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
9029                 res = notifier_to_errno(res);
9030                 if (res)
9031                         goto err_rollback;
9032                 res = dev_qdisc_change_tx_queue_len(dev);
9033                 if (res)
9034                         goto err_rollback;
9035         }
9036 
9037         return 0;
9038 
9039 err_rollback:
9040         netdev_err(dev, "refused to change device tx_queue_len\n");
9041         WRITE_ONCE(dev->tx_queue_len, orig_len);
9042         return res;
9043 }
9044 
9045 /**
9046  *      dev_set_group - Change group this device belongs to
9047  *      @dev: device
9048  *      @new_group: group this device should belong to
9049  */
9050 void dev_set_group(struct net_device *dev, int new_group)
9051 {
9052         dev->group = new_group;
9053 }
9054 
9055 /**
9056  *      dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
9057  *      @dev: device
9058  *      @addr: new address
9059  *      @extack: netlink extended ack
9060  */
9061 int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
9062                               struct netlink_ext_ack *extack)
9063 {
9064         struct netdev_notifier_pre_changeaddr_info info = {
9065                 .info.dev = dev,
9066                 .info.extack = extack,
9067                 .dev_addr = addr,
9068         };
9069         int rc;
9070 
9071         rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
9072         return notifier_to_errno(rc);
9073 }
9074 EXPORT_SYMBOL(dev_pre_changeaddr_notify);
9075 
9076 /**
9077  *      dev_set_mac_address - Change Media Access Control Address
9078  *      @dev: device
9079  *      @sa: new address
9080  *      @extack: netlink extended ack
9081  *
9082  *      Change the hardware (MAC) address of the device
9083  */
9084 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
9085                         struct netlink_ext_ack *extack)
9086 {
9087         const struct net_device_ops *ops = dev->netdev_ops;
9088         int err;
9089 
9090         if (!ops->ndo_set_mac_address)
9091                 return -EOPNOTSUPP;
9092         if (sa->sa_family != dev->type)
9093                 return -EINVAL;
9094         if (!netif_device_present(dev))
9095                 return -ENODEV;
9096         err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
9097         if (err)
9098                 return err;
9099         if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
9100                 err = ops->ndo_set_mac_address(dev, sa);
9101                 if (err)
9102                         return err;
9103         }
9104         dev->addr_assign_type = NET_ADDR_SET;
9105         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
9106         add_device_randomness(dev->dev_addr, dev->addr_len);
9107         return 0;
9108 }
9109 EXPORT_SYMBOL(dev_set_mac_address);
9110 
9111 DECLARE_RWSEM(dev_addr_sem);
9112 
9113 int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
9114                              struct netlink_ext_ack *extack)
9115 {
9116         int ret;
9117 
9118         down_write(&dev_addr_sem);
9119         ret = dev_set_mac_address(dev, sa, extack);
9120         up_write(&dev_addr_sem);
9121         return ret;
9122 }
9123 EXPORT_SYMBOL(dev_set_mac_address_user);
9124 
9125 int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
9126 {
9127         size_t size = sizeof(sa->sa_data_min);
9128         struct net_device *dev;
9129         int ret = 0;
9130 
9131         down_read(&dev_addr_sem);
9132         rcu_read_lock();
9133 
9134         dev = dev_get_by_name_rcu(net, dev_name);
9135         if (!dev) {
9136                 ret = -ENODEV;
9137                 goto unlock;
9138         }
9139         if (!dev->addr_len)
9140                 memset(sa->sa_data, 0, size);
9141         else
9142                 memcpy(sa->sa_data, dev->dev_addr,
9143                        min_t(size_t, size, dev->addr_len));
9144         sa->sa_family = dev->type;
9145 
9146 unlock:
9147         rcu_read_unlock();
9148         up_read(&dev_addr_sem);
9149         return ret;
9150 }
9151 EXPORT_SYMBOL(dev_get_mac_address);
9152 
9153 /**
9154  *      dev_change_carrier - Change device carrier
9155  *      @dev: device
9156  *      @new_carrier: new value
9157  *
9158  *      Change device carrier
9159  */
9160 int dev_change_carrier(struct net_device *dev, bool new_carrier)
9161 {
9162         const struct net_device_ops *ops = dev->netdev_ops;
9163 
9164         if (!ops->ndo_change_carrier)
9165                 return -EOPNOTSUPP;
9166         if (!netif_device_present(dev))
9167                 return -ENODEV;
9168         return ops->ndo_change_carrier(dev, new_carrier);
9169 }
9170 
9171 /**
9172  *      dev_get_phys_port_id - Get device physical port ID
9173  *      @dev: device
9174  *      @ppid: port ID
9175  *
9176  *      Get device physical port ID
9177  */
9178 int dev_get_phys_port_id(struct net_device *dev,
9179                          struct netdev_phys_item_id *ppid)
9180 {
9181         const struct net_device_ops *ops = dev->netdev_ops;
9182 
9183         if (!ops->ndo_get_phys_port_id)
9184                 return -EOPNOTSUPP;
9185         return ops->ndo_get_phys_port_id(dev, ppid);
9186 }
9187 
9188 /**
9189  *      dev_get_phys_port_name - Get device physical port name
9190  *      @dev: device
9191  *      @name: port name
9192  *      @len: limit of bytes to copy to name
9193  *
9194  *      Get device physical port name
9195  */
9196 int dev_get_phys_port_name(struct net_device *dev,
9197                            char *name, size_t len)
9198 {
9199         const struct net_device_ops *ops = dev->netdev_ops;
9200         int err;
9201 
9202         if (ops->ndo_get_phys_port_name) {
9203                 err = ops->ndo_get_phys_port_name(dev, name, len);
9204                 if (err != -EOPNOTSUPP)
9205                         return err;
9206         }
9207         return devlink_compat_phys_port_name_get(dev, name, len);
9208 }
9209 
9210 /**
9211  *      dev_get_port_parent_id - Get the device's port parent identifier
9212  *      @dev: network device
9213  *      @ppid: pointer to a storage for the port's parent identifier
9214  *      @recurse: allow/disallow recursion to lower devices
9215  *
9216  *      Get the devices's port parent identifier
9217  */
9218 int dev_get_port_parent_id(struct net_device *dev,
9219                            struct netdev_phys_item_id *ppid,
9220                            bool recurse)
9221 {
9222         const struct net_device_ops *ops = dev->netdev_ops;
9223         struct netdev_phys_item_id first = { };
9224         struct net_device *lower_dev;
9225         struct list_head *iter;
9226         int err;
9227 
9228         if (ops->ndo_get_port_parent_id) {
9229                 err = ops->ndo_get_port_parent_id(dev, ppid);
9230                 if (err != -EOPNOTSUPP)
9231                         return err;
9232         }
9233 
9234         err = devlink_compat_switch_id_get(dev, ppid);
9235         if (!recurse || err != -EOPNOTSUPP)
9236                 return err;
9237 
9238         netdev_for_each_lower_dev(dev, lower_dev, iter) {
9239                 err = dev_get_port_parent_id(lower_dev, ppid, true);
9240                 if (err)
9241                         break;
9242                 if (!first.id_len)
9243                         first = *ppid;
9244                 else if (memcmp(&first, ppid, sizeof(*ppid)))
9245                         return -EOPNOTSUPP;
9246         }
9247 
9248         return err;
9249 }
9250 EXPORT_SYMBOL(dev_get_port_parent_id);
9251 
9252 /**
9253  *      netdev_port_same_parent_id - Indicate if two network devices have
9254  *      the same port parent identifier
9255  *      @a: first network device
9256  *      @b: second network device
9257  */
9258 bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
9259 {
9260         struct netdev_phys_item_id a_id = { };
9261         struct netdev_phys_item_id b_id = { };
9262 
9263         if (dev_get_port_parent_id(a, &a_id, true) ||
9264             dev_get_port_parent_id(b, &b_id, true))
9265                 return false;
9266 
9267         return netdev_phys_item_id_same(&a_id, &b_id);
9268 }
9269 EXPORT_SYMBOL(netdev_port_same_parent_id);
9270 
9271 /**
9272  *      dev_change_proto_down - set carrier according to proto_down.
9273  *
9274  *      @dev: device
9275  *      @proto_down: new value
9276  */
9277 int dev_change_proto_down(struct net_device *dev, bool proto_down)
9278 {
9279         if (!(dev->priv_flags & IFF_CHANGE_PROTO_DOWN))
9280                 return -EOPNOTSUPP;
9281         if (!netif_device_present(dev))
9282                 return -ENODEV;
9283         if (proto_down)
9284                 netif_carrier_off(dev);
9285         else
9286                 netif_carrier_on(dev);
9287         WRITE_ONCE(dev->proto_down, proto_down);
9288         return 0;
9289 }
9290 
9291 /**
9292  *      dev_change_proto_down_reason - proto down reason
9293  *
9294  *      @dev: device
9295  *      @mask: proto down mask
9296  *      @value: proto down value
9297  */
9298 void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
9299                                   u32 value)
9300 {
9301         u32 proto_down_reason;
9302         int b;
9303 
9304         if (!mask) {
9305                 proto_down_reason = value;
9306         } else {
9307                 proto_down_reason = dev->proto_down_reason;
9308                 for_each_set_bit(b, &mask, 32) {
9309                         if (value & (1 << b))
9310                                 proto_down_reason |= BIT(b);
9311                         else
9312                                 proto_down_reason &= ~BIT(b);
9313                 }
9314         }
9315         WRITE_ONCE(dev->proto_down_reason, proto_down_reason);
9316 }
9317 
9318 struct bpf_xdp_link {
9319         struct bpf_link link;
9320         struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
9321         int flags;
9322 };
9323 
9324 static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
9325 {
9326         if (flags & XDP_FLAGS_HW_MODE)
9327                 return XDP_MODE_HW;
9328         if (flags & XDP_FLAGS_DRV_MODE)
9329                 return XDP_MODE_DRV;
9330         if (flags & XDP_FLAGS_SKB_MODE)
9331                 return XDP_MODE_SKB;
9332         return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
9333 }
9334 
9335 static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
9336 {
9337         switch (mode) {
9338         case XDP_MODE_SKB:
9339                 return generic_xdp_install;
9340         case XDP_MODE_DRV:
9341         case XDP_MODE_HW:
9342                 return dev->netdev_ops->ndo_bpf;
9343         default:
9344                 return NULL;
9345         }
9346 }
9347 
9348 static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
9349                                          enum bpf_xdp_mode mode)
9350 {
9351         return dev->xdp_state[mode].link;
9352 }
9353 
9354 static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
9355                                      enum bpf_xdp_mode mode)
9356 {
9357         struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
9358 
9359         if (link)
9360                 return link->link.prog;
9361         return dev->xdp_state[mode].prog;
9362 }
9363 
9364 u8 dev_xdp_prog_count(struct net_device *dev)
9365 {
9366         u8 count = 0;
9367         int i;
9368 
9369         for (i = 0; i < __MAX_XDP_MODE; i++)
9370                 if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
9371                         count++;
9372         return count;
9373 }
9374 EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
9375 
9376 u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
9377 {
9378         struct bpf_prog *prog = dev_xdp_prog(dev, mode);
9379 
9380         return prog ? prog->aux->id : 0;
9381 }
9382 
9383 static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
9384                              struct bpf_xdp_link *link)
9385 {
9386         dev->xdp_state[mode].link = link;
9387         dev->xdp_state[mode].prog = NULL;
9388 }
9389 
9390 static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
9391                              struct bpf_prog *prog)
9392 {
9393         dev->xdp_state[mode].link = NULL;
9394         dev->xdp_state[mode].prog = prog;
9395 }
9396 
9397 static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
9398                            bpf_op_t bpf_op, struct netlink_ext_ack *extack,
9399                            u32 flags, struct bpf_prog *prog)
9400 {
9401         struct netdev_bpf xdp;
9402         int err;
9403 
9404         memset(&xdp, 0, sizeof(xdp));
9405         xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
9406         xdp.extack = extack;
9407         xdp.flags = flags;
9408         xdp.prog = prog;
9409 
9410         /* Drivers assume refcnt is already incremented (i.e, prog pointer is
9411          * "moved" into driver), so they don't increment it on their own, but
9412          * they do decrement refcnt when program is detached or replaced.
9413          * Given net_device also owns link/prog, we need to bump refcnt here
9414          * to prevent drivers from underflowing it.
9415          */
9416         if (prog)
9417                 bpf_prog_inc(prog);
9418         err = bpf_op(dev, &xdp);
9419         if (err) {
9420                 if (prog)
9421                         bpf_prog_put(prog);
9422                 return err;
9423         }
9424 
9425         if (mode != XDP_MODE_HW)
9426                 bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9427 
9428         return 0;
9429 }
9430 
9431 static void dev_xdp_uninstall(struct net_device *dev)
9432 {
9433         struct bpf_xdp_link *link;
9434         struct bpf_prog *prog;
9435         enum bpf_xdp_mode mode;
9436         bpf_op_t bpf_op;
9437 
9438         ASSERT_RTNL();
9439 
9440         for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9441                 prog = dev_xdp_prog(dev, mode);
9442                 if (!prog)
9443                         continue;
9444 
9445                 bpf_op = dev_xdp_bpf_op(dev, mode);
9446                 if (!bpf_op)
9447                         continue;
9448 
9449                 WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9450 
9451                 /* auto-detach link from net device */
9452                 link = dev_xdp_link(dev, mode);
9453                 if (link)
9454                         link->dev = NULL;
9455                 else
9456                         bpf_prog_put(prog);
9457 
9458                 dev_xdp_set_link(dev, mode, NULL);
9459         }
9460 }
9461 
9462 static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9463                           struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9464                           struct bpf_prog *old_prog, u32 flags)
9465 {
9466         unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9467         struct bpf_prog *cur_prog;
9468         struct net_device *upper;
9469         struct list_head *iter;
9470         enum bpf_xdp_mode mode;
9471         bpf_op_t bpf_op;
9472         int err;
9473 
9474         ASSERT_RTNL();
9475 
9476         /* either link or prog attachment, never both */
9477         if (link && (new_prog || old_prog))
9478                 return -EINVAL;
9479         /* link supports only XDP mode flags */
9480         if (link && (flags & ~XDP_FLAGS_MODES)) {
9481                 NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9482                 return -EINVAL;
9483         }
9484         /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9485         if (num_modes > 1) {
9486                 NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9487                 return -EINVAL;
9488         }
9489         /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9490         if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9491                 NL_SET_ERR_MSG(extack,
9492                                "More than one program loaded, unset mode is ambiguous");
9493                 return -EINVAL;
9494         }
9495         /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9496         if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9497                 NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9498                 return -EINVAL;
9499         }
9500 
9501         mode = dev_xdp_mode(dev, flags);
9502         /* can't replace attached link */
9503         if (dev_xdp_link(dev, mode)) {
9504                 NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9505                 return -EBUSY;
9506         }
9507 
9508         /* don't allow if an upper device already has a program */
9509         netdev_for_each_upper_dev_rcu(dev, upper, iter) {
9510                 if (dev_xdp_prog_count(upper) > 0) {
9511                         NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
9512                         return -EEXIST;
9513                 }
9514         }
9515 
9516         cur_prog = dev_xdp_prog(dev, mode);
9517         /* can't replace attached prog with link */
9518         if (link && cur_prog) {
9519                 NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9520                 return -EBUSY;
9521         }
9522         if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9523                 NL_SET_ERR_MSG(extack, "Active program does not match expected");
9524                 return -EEXIST;
9525         }
9526 
9527         /* put effective new program into new_prog */
9528         if (link)
9529                 new_prog = link->link.prog;
9530 
9531         if (new_prog) {
9532                 bool offload = mode == XDP_MODE_HW;
9533                 enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9534                                                ? XDP_MODE_DRV : XDP_MODE_SKB;
9535 
9536                 if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
9537                         NL_SET_ERR_MSG(extack, "XDP program already attached");
9538                         return -EBUSY;
9539                 }
9540                 if (!offload && dev_xdp_prog(dev, other_mode)) {
9541                         NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
9542                         return -EEXIST;
9543                 }
9544                 if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
9545                         NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
9546                         return -EINVAL;
9547                 }
9548                 if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
9549                         NL_SET_ERR_MSG(extack, "Program bound to different device");
9550                         return -EINVAL;
9551                 }
9552                 if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9553                         NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9554                         return -EINVAL;
9555                 }
9556                 if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9557                         NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9558                         return -EINVAL;
9559                 }
9560         }
9561 
9562         /* don't call drivers if the effective program didn't change */
9563         if (new_prog != cur_prog) {
9564                 bpf_op = dev_xdp_bpf_op(dev, mode);
9565                 if (!bpf_op) {
9566                         NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9567                         return -EOPNOTSUPP;
9568                 }
9569 
9570                 err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9571                 if (err)
9572                         return err;
9573         }
9574 
9575         if (link)
9576                 dev_xdp_set_link(dev, mode, link);
9577         else
9578                 dev_xdp_set_prog(dev, mode, new_prog);
9579         if (cur_prog)
9580                 bpf_prog_put(cur_prog);
9581 
9582         return 0;
9583 }
9584 
9585 static int dev_xdp_attach_link(struct net_device *dev,
9586                                struct netlink_ext_ack *extack,
9587                                struct bpf_xdp_link *link)
9588 {
9589         return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9590 }
9591 
9592 static int dev_xdp_detach_link(struct net_device *dev,
9593                                struct netlink_ext_ack *extack,
9594                                struct bpf_xdp_link *link)
9595 {
9596         enum bpf_xdp_mode mode;
9597         bpf_op_t bpf_op;
9598 
9599         ASSERT_RTNL();
9600 
9601         mode = dev_xdp_mode(dev, link->flags);
9602         if (dev_xdp_link(dev, mode) != link)
9603                 return -EINVAL;
9604 
9605         bpf_op = dev_xdp_bpf_op(dev, mode);
9606         WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9607         dev_xdp_set_link(dev, mode, NULL);
9608         return 0;
9609 }
9610 
9611 static void bpf_xdp_link_release(struct bpf_link *link)
9612 {
9613         struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9614 
9615         rtnl_lock();
9616 
9617         /* if racing with net_device's tear down, xdp_link->dev might be
9618          * already NULL, in which case link was already auto-detached
9619          */
9620         if (xdp_link->dev) {
9621                 WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9622                 xdp_link->dev = NULL;
9623         }
9624 
9625         rtnl_unlock();
9626 }
9627 
9628 static int bpf_xdp_link_detach(struct bpf_link *link)
9629 {
9630         bpf_xdp_link_release(link);
9631         return 0;
9632 }
9633 
9634 static void bpf_xdp_link_dealloc(struct bpf_link *link)
9635 {
9636         struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9637 
9638         kfree(xdp_link);
9639 }
9640 
9641 static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9642                                      struct seq_file *seq)
9643 {
9644         struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9645         u32 ifindex = 0;
9646 
9647         rtnl_lock();
9648         if (xdp_link->dev)
9649                 ifindex = xdp_link->dev->ifindex;
9650         rtnl_unlock();
9651 
9652         seq_printf(seq, "ifindex:\t%u\n", ifindex);
9653 }
9654 
9655 static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9656                                        struct bpf_link_info *info)
9657 {
9658         struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9659         u32 ifindex = 0;
9660 
9661         rtnl_lock();
9662         if (xdp_link->dev)
9663                 ifindex = xdp_link->dev->ifindex;
9664         rtnl_unlock();
9665 
9666         info->xdp.ifindex = ifindex;
9667         return 0;
9668 }
9669 
9670 static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9671                                struct bpf_prog *old_prog)
9672 {
9673         struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9674         enum bpf_xdp_mode mode;
9675         bpf_op_t bpf_op;
9676         int err = 0;
9677 
9678         rtnl_lock();
9679 
9680         /* link might have been auto-released already, so fail */
9681         if (!xdp_link->dev) {
9682                 err = -ENOLINK;
9683                 goto out_unlock;
9684         }
9685 
9686         if (old_prog && link->prog != old_prog) {
9687                 err = -EPERM;
9688                 goto out_unlock;
9689         }
9690         old_prog = link->prog;
9691         if (old_prog->type != new_prog->type ||
9692             old_prog->expected_attach_type != new_prog->expected_attach_type) {
9693                 err = -EINVAL;
9694                 goto out_unlock;
9695         }
9696 
9697         if (old_prog == new_prog) {
9698                 /* no-op, don't disturb drivers */
9699                 bpf_prog_put(new_prog);
9700                 goto out_unlock;
9701         }
9702 
9703         mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9704         bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9705         err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9706                               xdp_link->flags, new_prog);
9707         if (err)
9708                 goto out_unlock;
9709 
9710         old_prog = xchg(&link->prog, new_prog);
9711         bpf_prog_put(old_prog);
9712 
9713 out_unlock:
9714         rtnl_unlock();
9715         return err;
9716 }
9717 
9718 static const struct bpf_link_ops bpf_xdp_link_lops = {
9719         .release = bpf_xdp_link_release,
9720         .dealloc = bpf_xdp_link_dealloc,
9721         .detach = bpf_xdp_link_detach,
9722         .show_fdinfo = bpf_xdp_link_show_fdinfo,
9723         .fill_link_info = bpf_xdp_link_fill_link_info,
9724         .update_prog = bpf_xdp_link_update,
9725 };
9726 
9727 int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9728 {
9729         struct net *net = current->nsproxy->net_ns;
9730         struct bpf_link_primer link_primer;
9731         struct netlink_ext_ack extack = {};
9732         struct bpf_xdp_link *link;
9733         struct net_device *dev;
9734         int err, fd;
9735 
9736         rtnl_lock();
9737         dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9738         if (!dev) {
9739                 rtnl_unlock();
9740                 return -EINVAL;
9741         }
9742 
9743         link = kzalloc(sizeof(*link), GFP_USER);
9744         if (!link) {
9745                 err = -ENOMEM;
9746                 goto unlock;
9747         }
9748 
9749         bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9750         link->dev = dev;
9751         link->flags = attr->link_create.flags;
9752 
9753         err = bpf_link_prime(&link->link, &link_primer);
9754         if (err) {
9755                 kfree(link);
9756                 goto unlock;
9757         }
9758 
9759         err = dev_xdp_attach_link(dev, &extack, link);
9760         rtnl_unlock();
9761 
9762         if (err) {
9763                 link->dev = NULL;
9764                 bpf_link_cleanup(&link_primer);
9765                 trace_bpf_xdp_link_attach_failed(extack._msg);
9766                 goto out_put_dev;
9767         }
9768 
9769         fd = bpf_link_settle(&link_primer);
9770         /* link itself doesn't hold dev's refcnt to not complicate shutdown */
9771         dev_put(dev);
9772         return fd;
9773 
9774 unlock:
9775         rtnl_unlock();
9776 
9777 out_put_dev:
9778         dev_put(dev);
9779         return err;
9780 }
9781 
9782 /**
9783  *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
9784  *      @dev: device
9785  *      @extack: netlink extended ack
9786  *      @fd: new program fd or negative value to clear
9787  *      @expected_fd: old program fd that userspace expects to replace or clear
9788  *      @flags: xdp-related flags
9789  *
9790  *      Set or clear a bpf program for a device
9791  */
9792 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
9793                       int fd, int expected_fd, u32 flags)
9794 {
9795         enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9796         struct bpf_prog *new_prog = NULL, *old_prog = NULL;
9797         int err;
9798 
9799         ASSERT_RTNL();
9800 
9801         if (fd >= 0) {
9802                 new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9803                                                  mode != XDP_MODE_SKB);
9804                 if (IS_ERR(new_prog))
9805                         return PTR_ERR(new_prog);
9806         }
9807 
9808         if (expected_fd >= 0) {
9809                 old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9810                                                  mode != XDP_MODE_SKB);
9811                 if (IS_ERR(old_prog)) {
9812                         err = PTR_ERR(old_prog);
9813                         old_prog = NULL;
9814                         goto err_out;
9815                 }
9816         }
9817 
9818         err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
9819 
9820 err_out:
9821         if (err && new_prog)
9822                 bpf_prog_put(new_prog);
9823         if (old_prog)
9824                 bpf_prog_put(old_prog);
9825         return err;
9826 }
9827 
9828 /**
9829  * dev_index_reserve() - allocate an ifindex in a namespace
9830  * @net: the applicable net namespace
9831  * @ifindex: requested ifindex, pass %0 to get one allocated
9832  *
9833  * Allocate a ifindex for a new device. Caller must either use the ifindex
9834  * to store the device (via list_netdevice()) or call dev_index_release()
9835  * to give the index up.
9836  *
9837  * Return: a suitable unique value for a new device interface number or -errno.
9838  */
9839 static int dev_index_reserve(struct net *net, u32 ifindex)
9840 {
9841         int err;
9842 
9843         if (ifindex > INT_MAX) {
9844                 DEBUG_NET_WARN_ON_ONCE(1);
9845                 return -EINVAL;
9846         }
9847 
9848         if (!ifindex)
9849                 err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
9850                                       xa_limit_31b, &net->ifindex, GFP_KERNEL);
9851         else
9852                 err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
9853         if (err < 0)
9854                 return err;
9855 
9856         return ifindex;
9857 }
9858 
9859 static void dev_index_release(struct net *net, int ifindex)
9860 {
9861         /* Expect only unused indexes, unlist_netdevice() removes the used */
9862         WARN_ON(xa_erase(&net->dev_by_index, ifindex));
9863 }
9864 
9865 /* Delayed registration/unregisteration */
9866 LIST_HEAD(net_todo_list);
9867 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
9868 atomic_t dev_unreg_count = ATOMIC_INIT(0);
9869 
9870 static void net_set_todo(struct net_device *dev)
9871 {
9872         list_add_tail(&dev->todo_list, &net_todo_list);
9873 }
9874 
9875 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
9876         struct net_device *upper, netdev_features_t features)
9877 {
9878         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9879         netdev_features_t feature;
9880         int feature_bit;
9881 
9882         for_each_netdev_feature(upper_disables, feature_bit) {
9883                 feature = __NETIF_F_BIT(feature_bit);
9884                 if (!(upper->wanted_features & feature)
9885                     && (features & feature)) {
9886                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
9887                                    &feature, upper->name);
9888                         features &= ~feature;
9889                 }
9890         }
9891 
9892         return features;
9893 }
9894 
9895 static void netdev_sync_lower_features(struct net_device *upper,
9896         struct net_device *lower, netdev_features_t features)
9897 {
9898         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9899         netdev_features_t feature;
9900         int feature_bit;
9901 
9902         for_each_netdev_feature(upper_disables, feature_bit) {
9903                 feature = __NETIF_F_BIT(feature_bit);
9904                 if (!(features & feature) && (lower->features & feature)) {
9905                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
9906                                    &feature, lower->name);
9907                         lower->wanted_features &= ~feature;
9908                         __netdev_update_features(lower);
9909 
9910                         if (unlikely(lower->features & feature))
9911                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
9912                                             &feature, lower->name);
9913                         else
9914                                 netdev_features_change(lower);
9915                 }
9916         }
9917 }
9918 
9919 static bool netdev_has_ip_or_hw_csum(netdev_features_t features)
9920 {
9921         netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
9922         bool ip_csum = (features & ip_csum_mask) == ip_csum_mask;
9923         bool hw_csum = features & NETIF_F_HW_CSUM;
9924 
9925         return ip_csum || hw_csum;
9926 }
9927 
9928 static netdev_features_t netdev_fix_features(struct net_device *dev,
9929         netdev_features_t features)
9930 {
9931         /* Fix illegal checksum combinations */
9932         if ((features & NETIF_F_HW_CSUM) &&
9933             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9934                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9935                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
9936         }
9937 
9938         /* TSO requires that SG is present as well. */
9939         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9940                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9941                 features &= ~NETIF_F_ALL_TSO;
9942         }
9943 
9944         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
9945                                         !(features & NETIF_F_IP_CSUM)) {
9946                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
9947                 features &= ~NETIF_F_TSO;
9948                 features &= ~NETIF_F_TSO_ECN;
9949         }
9950 
9951         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
9952                                          !(features & NETIF_F_IPV6_CSUM)) {
9953                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
9954                 features &= ~NETIF_F_TSO6;
9955         }
9956 
9957         /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
9958         if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
9959                 features &= ~NETIF_F_TSO_MANGLEID;
9960 
9961         /* TSO ECN requires that TSO is present as well. */
9962         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
9963                 features &= ~NETIF_F_TSO_ECN;
9964 
9965         /* Software GSO depends on SG. */
9966         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9967                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9968                 features &= ~NETIF_F_GSO;
9969         }
9970 
9971         /* GSO partial features require GSO partial be set */
9972         if ((features & dev->gso_partial_features) &&
9973             !(features & NETIF_F_GSO_PARTIAL)) {
9974                 netdev_dbg(dev,
9975                            "Dropping partially supported GSO features since no GSO partial.\n");
9976                 features &= ~dev->gso_partial_features;
9977         }
9978 
9979         if (!(features & NETIF_F_RXCSUM)) {
9980                 /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
9981                  * successfully merged by hardware must also have the
9982                  * checksum verified by hardware.  If the user does not
9983                  * want to enable RXCSUM, logically, we should disable GRO_HW.
9984                  */
9985                 if (features & NETIF_F_GRO_HW) {
9986                         netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
9987                         features &= ~NETIF_F_GRO_HW;
9988                 }
9989         }
9990 
9991         /* LRO/HW-GRO features cannot be combined with RX-FCS */
9992         if (features & NETIF_F_RXFCS) {
9993                 if (features & NETIF_F_LRO) {
9994                         netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
9995                         features &= ~NETIF_F_LRO;
9996                 }
9997 
9998                 if (features & NETIF_F_GRO_HW) {
9999                         netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
10000                         features &= ~NETIF_F_GRO_HW;
10001                 }
10002         }
10003 
10004         if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
10005                 netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
10006                 features &= ~NETIF_F_LRO;
10007         }
10008 
10009         if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) {
10010                 netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
10011                 features &= ~NETIF_F_HW_TLS_TX;
10012         }
10013 
10014         if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
10015                 netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
10016                 features &= ~NETIF_F_HW_TLS_RX;
10017         }
10018 
10019         if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) {
10020                 netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n");
10021                 features &= ~NETIF_F_GSO_UDP_L4;
10022         }
10023 
10024         return features;
10025 }
10026 
10027 int __netdev_update_features(struct net_device *dev)
10028 {
10029         struct net_device *upper, *lower;
10030         netdev_features_t features;
10031         struct list_head *iter;
10032         int err = -1;
10033 
10034         ASSERT_RTNL();
10035 
10036         features = netdev_get_wanted_features(dev);
10037 
10038         if (dev->netdev_ops->ndo_fix_features)
10039                 features = dev->netdev_ops->ndo_fix_features(dev, features);
10040 
10041         /* driver might be less strict about feature dependencies */
10042         features = netdev_fix_features(dev, features);
10043 
10044         /* some features can't be enabled if they're off on an upper device */
10045         netdev_for_each_upper_dev_rcu(dev, upper, iter)
10046                 features = netdev_sync_upper_features(dev, upper, features);
10047 
10048         if (dev->features == features)
10049                 goto sync_lower;
10050 
10051         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
10052                 &dev->features, &features);
10053 
10054         if (dev->netdev_ops->ndo_set_features)
10055                 err = dev->netdev_ops->ndo_set_features(dev, features);
10056         else
10057                 err = 0;
10058 
10059         if (unlikely(err < 0)) {
10060                 netdev_err(dev,
10061                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
10062                         err, &features, &dev->features);
10063                 /* return non-0 since some features might have changed and
10064                  * it's better to fire a spurious notification than miss it
10065                  */
10066                 return -1;
10067         }
10068 
10069 sync_lower:
10070         /* some features must be disabled on lower devices when disabled
10071          * on an upper device (think: bonding master or bridge)
10072          */
10073         netdev_for_each_lower_dev(dev, lower, iter)
10074                 netdev_sync_lower_features(dev, lower, features);
10075 
10076         if (!err) {
10077                 netdev_features_t diff = features ^ dev->features;
10078 
10079                 if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
10080                         /* udp_tunnel_{get,drop}_rx_info both need
10081                          * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
10082                          * device, or they won't do anything.
10083                          * Thus we need to update dev->features
10084                          * *before* calling udp_tunnel_get_rx_info,
10085                          * but *after* calling udp_tunnel_drop_rx_info.
10086                          */
10087                         if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
10088                                 dev->features = features;
10089                                 udp_tunnel_get_rx_info(dev);
10090                         } else {
10091                                 udp_tunnel_drop_rx_info(dev);
10092                         }
10093                 }
10094 
10095                 if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
10096                         if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
10097                                 dev->features = features;
10098                                 err |= vlan_get_rx_ctag_filter_info(dev);
10099                         } else {
10100                                 vlan_drop_rx_ctag_filter_info(dev);
10101                         }
10102                 }
10103 
10104                 if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
10105                         if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
10106                                 dev->features = features;
10107                                 err |= vlan_get_rx_stag_filter_info(dev);
10108                         } else {
10109                                 vlan_drop_rx_stag_filter_info(dev);
10110                         }
10111                 }
10112 
10113                 dev->features = features;
10114         }
10115 
10116         return err < 0 ? 0 : 1;
10117 }
10118 
10119 /**
10120  *      netdev_update_features - recalculate device features
10121  *      @dev: the device to check
10122  *
10123  *      Recalculate dev->features set and send notifications if it
10124  *      has changed. Should be called after driver or hardware dependent
10125  *      conditions might have changed that influence the features.
10126  */
10127 void netdev_update_features(struct net_device *dev)
10128 {
10129         if (__netdev_update_features(dev))
10130                 netdev_features_change(dev);
10131 }
10132 EXPORT_SYMBOL(netdev_update_features);
10133 
10134 /**
10135  *      netdev_change_features - recalculate device features
10136  *      @dev: the device to check
10137  *
10138  *      Recalculate dev->features set and send notifications even
10139  *      if they have not changed. Should be called instead of
10140  *      netdev_update_features() if also dev->vlan_features might
10141  *      have changed to allow the changes to be propagated to stacked
10142  *      VLAN devices.
10143  */
10144 void netdev_change_features(struct net_device *dev)
10145 {
10146         __netdev_update_features(dev);
10147         netdev_features_change(dev);
10148 }
10149 EXPORT_SYMBOL(netdev_change_features);
10150 
10151 /**
10152  *      netif_stacked_transfer_operstate -      transfer operstate
10153  *      @rootdev: the root or lower level device to transfer state from
10154  *      @dev: the device to transfer operstate to
10155  *
10156  *      Transfer operational state from root to device. This is normally
10157  *      called when a stacking relationship exists between the root
10158  *      device and the device(a leaf device).
10159  */
10160 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
10161                                         struct net_device *dev)
10162 {
10163         if (rootdev->operstate == IF_OPER_DORMANT)
10164                 netif_dormant_on(dev);
10165         else
10166                 netif_dormant_off(dev);
10167 
10168         if (rootdev->operstate == IF_OPER_TESTING)
10169                 netif_testing_on(dev);
10170         else
10171                 netif_testing_off(dev);
10172 
10173         if (netif_carrier_ok(rootdev))
10174                 netif_carrier_on(dev);
10175         else
10176                 netif_carrier_off(dev);
10177 }
10178 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
10179 
10180 static int netif_alloc_rx_queues(struct net_device *dev)
10181 {
10182         unsigned int i, count = dev->num_rx_queues;
10183         struct netdev_rx_queue *rx;
10184         size_t sz = count * sizeof(*rx);
10185         int err = 0;
10186 
10187         BUG_ON(count < 1);
10188 
10189         rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10190         if (!rx)
10191                 return -ENOMEM;
10192 
10193         dev->_rx = rx;
10194 
10195         for (i = 0; i < count; i++) {
10196                 rx[i].dev = dev;
10197 
10198                 /* XDP RX-queue setup */
10199                 err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
10200                 if (err < 0)
10201                         goto err_rxq_info;
10202         }
10203         return 0;
10204 
10205 err_rxq_info:
10206         /* Rollback successful reg's and free other resources */
10207         while (i--)
10208                 xdp_rxq_info_unreg(&rx[i].xdp_rxq);
10209         kvfree(dev->_rx);
10210         dev->_rx = NULL;
10211         return err;
10212 }
10213 
10214 static void netif_free_rx_queues(struct net_device *dev)
10215 {
10216         unsigned int i, count = dev->num_rx_queues;
10217 
10218         /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
10219         if (!dev->_rx)
10220                 return;
10221 
10222         for (i = 0; i < count; i++)
10223                 xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
10224 
10225         kvfree(dev->_rx);
10226 }
10227 
10228 static void netdev_init_one_queue(struct net_device *dev,
10229                                   struct netdev_queue *queue, void *_unused)
10230 {
10231         /* Initialize queue lock */
10232         spin_lock_init(&queue->_xmit_lock);
10233         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
10234         queue->xmit_lock_owner = -1;
10235         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
10236         queue->dev = dev;
10237 #ifdef CONFIG_BQL
10238         dql_init(&queue->dql, HZ);
10239 #endif
10240 }
10241 
10242 static void netif_free_tx_queues(struct net_device *dev)
10243 {
10244         kvfree(dev->_tx);
10245 }
10246 
10247 static int netif_alloc_netdev_queues(struct net_device *dev)
10248 {
10249         unsigned int count = dev->num_tx_queues;
10250         struct netdev_queue *tx;
10251         size_t sz = count * sizeof(*tx);
10252 
10253         if (count < 1 || count > 0xffff)
10254                 return -EINVAL;
10255 
10256         tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
10257         if (!tx)
10258                 return -ENOMEM;
10259 
10260         dev->_tx = tx;
10261 
10262         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
10263         spin_lock_init(&dev->tx_global_lock);
10264 
10265         return 0;
10266 }
10267 
10268 void netif_tx_stop_all_queues(struct net_device *dev)
10269 {
10270         unsigned int i;
10271 
10272         for (i = 0; i < dev->num_tx_queues; i++) {
10273                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
10274 
10275                 netif_tx_stop_queue(txq);
10276         }
10277 }
10278 EXPORT_SYMBOL(netif_tx_stop_all_queues);
10279 
10280 static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
10281 {
10282         void __percpu *v;
10283 
10284         /* Drivers implementing ndo_get_peer_dev must support tstat
10285          * accounting, so that skb_do_redirect() can bump the dev's
10286          * RX stats upon network namespace switch.
10287          */
10288         if (dev->netdev_ops->ndo_get_peer_dev &&
10289             dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
10290                 return -EOPNOTSUPP;
10291 
10292         switch (dev->pcpu_stat_type) {
10293         case NETDEV_PCPU_STAT_NONE:
10294                 return 0;
10295         case NETDEV_PCPU_STAT_LSTATS:
10296                 v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
10297                 break;
10298         case NETDEV_PCPU_STAT_TSTATS:
10299                 v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
10300                 break;
10301         case NETDEV_PCPU_STAT_DSTATS:
10302                 v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
10303                 break;
10304         default:
10305                 return -EINVAL;
10306         }
10307 
10308         return v ? 0 : -ENOMEM;
10309 }
10310 
10311 static void netdev_do_free_pcpu_stats(struct net_device *dev)
10312 {
10313         switch (dev->pcpu_stat_type) {
10314         case NETDEV_PCPU_STAT_NONE:
10315                 return;
10316         case NETDEV_PCPU_STAT_LSTATS:
10317                 free_percpu(dev->lstats);
10318                 break;
10319         case NETDEV_PCPU_STAT_TSTATS:
10320                 free_percpu(dev->tstats);
10321                 break;
10322         case NETDEV_PCPU_STAT_DSTATS:
10323                 free_percpu(dev->dstats);
10324                 break;
10325         }
10326 }
10327 
10328 /**
10329  * register_netdevice() - register a network device
10330  * @dev: device to register
10331  *
10332  * Take a prepared network device structure and make it externally accessible.
10333  * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
10334  * Callers must hold the rtnl lock - you may want register_netdev()
10335  * instead of this.
10336  */
10337 int register_netdevice(struct net_device *dev)
10338 {
10339         int ret;
10340         struct net *net = dev_net(dev);
10341 
10342         BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
10343                      NETDEV_FEATURE_COUNT);
10344         BUG_ON(dev_boot_phase);
10345         ASSERT_RTNL();
10346 
10347         might_sleep();
10348 
10349         /* When net_device's are persistent, this will be fatal. */
10350         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
10351         BUG_ON(!net);
10352 
10353         ret = ethtool_check_ops(dev->ethtool_ops);
10354         if (ret)
10355                 return ret;
10356 
10357         /* rss ctx ID 0 is reserved for the default context, start from 1 */
10358         xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1);
10359         mutex_init(&dev->ethtool->rss_lock);
10360 
10361         spin_lock_init(&dev->addr_list_lock);
10362         netdev_set_addr_lockdep_class(dev);
10363 
10364         ret = dev_get_valid_name(net, dev, dev->name);
10365         if (ret < 0)
10366                 goto out;
10367 
10368         ret = -ENOMEM;
10369         dev->name_node = netdev_name_node_head_alloc(dev);
10370         if (!dev->name_node)
10371                 goto out;
10372 
10373         /* Init, if this function is available */
10374         if (dev->netdev_ops->ndo_init) {
10375                 ret = dev->netdev_ops->ndo_init(dev);
10376                 if (ret) {
10377                         if (ret > 0)
10378                                 ret = -EIO;
10379                         goto err_free_name;
10380                 }
10381         }
10382 
10383         if (((dev->hw_features | dev->features) &
10384              NETIF_F_HW_VLAN_CTAG_FILTER) &&
10385             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
10386              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
10387                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
10388                 ret = -EINVAL;
10389                 goto err_uninit;
10390         }
10391 
10392         ret = netdev_do_alloc_pcpu_stats(dev);
10393         if (ret)
10394                 goto err_uninit;
10395 
10396         ret = dev_index_reserve(net, dev->ifindex);
10397         if (ret < 0)
10398                 goto err_free_pcpu;
10399         dev->ifindex = ret;
10400 
10401         /* Transfer changeable features to wanted_features and enable
10402          * software offloads (GSO and GRO).
10403          */
10404         dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
10405         dev->features |= NETIF_F_SOFT_FEATURES;
10406 
10407         if (dev->udp_tunnel_nic_info) {
10408                 dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10409                 dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10410         }
10411 
10412         dev->wanted_features = dev->features & dev->hw_features;
10413 
10414         if (!(dev->flags & IFF_LOOPBACK))
10415                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
10416 
10417         /* If IPv4 TCP segmentation offload is supported we should also
10418          * allow the device to enable segmenting the frame with the option
10419          * of ignoring a static IP ID value.  This doesn't enable the
10420          * feature itself but allows the user to enable it later.
10421          */
10422         if (dev->hw_features & NETIF_F_TSO)
10423                 dev->hw_features |= NETIF_F_TSO_MANGLEID;
10424         if (dev->vlan_features & NETIF_F_TSO)
10425                 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10426         if (dev->mpls_features & NETIF_F_TSO)
10427                 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10428         if (dev->hw_enc_features & NETIF_F_TSO)
10429                 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10430 
10431         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10432          */
10433         dev->vlan_features |= NETIF_F_HIGHDMA;
10434 
10435         /* Make NETIF_F_SG inheritable to tunnel devices.
10436          */
10437         dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10438 
10439         /* Make NETIF_F_SG inheritable to MPLS.
10440          */
10441         dev->mpls_features |= NETIF_F_SG;
10442 
10443         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10444         ret = notifier_to_errno(ret);
10445         if (ret)
10446                 goto err_ifindex_release;
10447 
10448         ret = netdev_register_kobject(dev);
10449 
10450         WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
10451 
10452         if (ret)
10453                 goto err_uninit_notify;
10454 
10455         __netdev_update_features(dev);
10456 
10457         /*
10458          *      Default initial state at registry is that the
10459          *      device is present.
10460          */
10461 
10462         set_bit(__LINK_STATE_PRESENT, &dev->state);
10463 
10464         linkwatch_init_dev(dev);
10465 
10466         dev_init_scheduler(dev);
10467 
10468         netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
10469         list_netdevice(dev);
10470 
10471         add_device_randomness(dev->dev_addr, dev->addr_len);
10472 
10473         /* If the device has permanent device address, driver should
10474          * set dev_addr and also addr_assign_type should be set to
10475          * NET_ADDR_PERM (default value).
10476          */
10477         if (dev->addr_assign_type == NET_ADDR_PERM)
10478                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10479 
10480         /* Notify protocols, that a new device appeared. */
10481         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10482         ret = notifier_to_errno(ret);
10483         if (ret) {
10484                 /* Expect explicit free_netdev() on failure */
10485                 dev->needs_free_netdev = false;
10486                 unregister_netdevice_queue(dev, NULL);
10487                 goto out;
10488         }
10489         /*
10490          *      Prevent userspace races by waiting until the network
10491          *      device is fully setup before sending notifications.
10492          */
10493         if (!dev->rtnl_link_ops ||
10494             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10495                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
10496 
10497 out:
10498         return ret;
10499 
10500 err_uninit_notify:
10501         call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
10502 err_ifindex_release:
10503         dev_index_release(net, dev->ifindex);
10504 err_free_pcpu:
10505         netdev_do_free_pcpu_stats(dev);
10506 err_uninit:
10507         if (dev->netdev_ops->ndo_uninit)
10508                 dev->netdev_ops->ndo_uninit(dev);
10509         if (dev->priv_destructor)
10510                 dev->priv_destructor(dev);
10511 err_free_name:
10512         netdev_name_node_free(dev->name_node);
10513         goto out;
10514 }
10515 EXPORT_SYMBOL(register_netdevice);
10516 
10517 /* Initialize the core of a dummy net device.
10518  * This is useful if you are calling this function after alloc_netdev(),
10519  * since it does not memset the net_device fields.
10520  */
10521 static void init_dummy_netdev_core(struct net_device *dev)
10522 {
10523         /* make sure we BUG if trying to hit standard
10524          * register/unregister code path
10525          */
10526         dev->reg_state = NETREG_DUMMY;
10527 
10528         /* NAPI wants this */
10529         INIT_LIST_HEAD(&dev->napi_list);
10530 
10531         /* a dummy interface is started by default */
10532         set_bit(__LINK_STATE_PRESENT, &dev->state);
10533         set_bit(__LINK_STATE_START, &dev->state);
10534 
10535         /* napi_busy_loop stats accounting wants this */
10536         dev_net_set(dev, &init_net);
10537 
10538         /* Note : We dont allocate pcpu_refcnt for dummy devices,
10539          * because users of this 'device' dont need to change
10540          * its refcount.
10541          */
10542 }
10543 
10544 /**
10545  *      init_dummy_netdev       - init a dummy network device for NAPI
10546  *      @dev: device to init
10547  *
10548  *      This takes a network device structure and initializes the minimum
10549  *      amount of fields so it can be used to schedule NAPI polls without
10550  *      registering a full blown interface. This is to be used by drivers
10551  *      that need to tie several hardware interfaces to a single NAPI
10552  *      poll scheduler due to HW limitations.
10553  */
10554 void init_dummy_netdev(struct net_device *dev)
10555 {
10556         /* Clear everything. Note we don't initialize spinlocks
10557          * as they aren't supposed to be taken by any of the
10558          * NAPI code and this dummy netdev is supposed to be
10559          * only ever used for NAPI polls
10560          */
10561         memset(dev, 0, sizeof(struct net_device));
10562         init_dummy_netdev_core(dev);
10563 }
10564 EXPORT_SYMBOL_GPL(init_dummy_netdev);
10565 
10566 /**
10567  *      register_netdev - register a network device
10568  *      @dev: device to register
10569  *
10570  *      Take a completed network device structure and add it to the kernel
10571  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10572  *      chain. 0 is returned on success. A negative errno code is returned
10573  *      on a failure to set up the device, or if the name is a duplicate.
10574  *
10575  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
10576  *      and expands the device name if you passed a format string to
10577  *      alloc_netdev.
10578  */
10579 int register_netdev(struct net_device *dev)
10580 {
10581         int err;
10582 
10583         if (rtnl_lock_killable())
10584                 return -EINTR;
10585         err = register_netdevice(dev);
10586         rtnl_unlock();
10587         return err;
10588 }
10589 EXPORT_SYMBOL(register_netdev);
10590 
10591 int netdev_refcnt_read(const struct net_device *dev)
10592 {
10593 #ifdef CONFIG_PCPU_DEV_REFCNT
10594         int i, refcnt = 0;
10595 
10596         for_each_possible_cpu(i)
10597                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10598         return refcnt;
10599 #else
10600         return refcount_read(&dev->dev_refcnt);
10601 #endif
10602 }
10603 EXPORT_SYMBOL(netdev_refcnt_read);
10604 
10605 int netdev_unregister_timeout_secs __read_mostly = 10;
10606 
10607 #define WAIT_REFS_MIN_MSECS 1
10608 #define WAIT_REFS_MAX_MSECS 250
10609 /**
10610  * netdev_wait_allrefs_any - wait until all references are gone.
10611  * @list: list of net_devices to wait on
10612  *
10613  * This is called when unregistering network devices.
10614  *
10615  * Any protocol or device that holds a reference should register
10616  * for netdevice notification, and cleanup and put back the
10617  * reference if they receive an UNREGISTER event.
10618  * We can get stuck here if buggy protocols don't correctly
10619  * call dev_put.
10620  */
10621 static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
10622 {
10623         unsigned long rebroadcast_time, warning_time;
10624         struct net_device *dev;
10625         int wait = 0;
10626 
10627         rebroadcast_time = warning_time = jiffies;
10628 
10629         list_for_each_entry(dev, list, todo_list)
10630                 if (netdev_refcnt_read(dev) == 1)
10631                         return dev;
10632 
10633         while (true) {
10634                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10635                         rtnl_lock();
10636 
10637                         /* Rebroadcast unregister notification */
10638                         list_for_each_entry(dev, list, todo_list)
10639                                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10640 
10641                         __rtnl_unlock();
10642                         rcu_barrier();
10643                         rtnl_lock();
10644 
10645                         list_for_each_entry(dev, list, todo_list)
10646                                 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10647                                              &dev->state)) {
10648                                         /* We must not have linkwatch events
10649                                          * pending on unregister. If this
10650                                          * happens, we simply run the queue
10651                                          * unscheduled, resulting in a noop
10652                                          * for this device.
10653                                          */
10654                                         linkwatch_run_queue();
10655                                         break;
10656                                 }
10657 
10658                         __rtnl_unlock();
10659 
10660                         rebroadcast_time = jiffies;
10661                 }
10662 
10663                 rcu_barrier();
10664 
10665                 if (!wait) {
10666                         wait = WAIT_REFS_MIN_MSECS;
10667                 } else {
10668                         msleep(wait);
10669                         wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10670                 }
10671 
10672                 list_for_each_entry(dev, list, todo_list)
10673                         if (netdev_refcnt_read(dev) == 1)
10674                                 return dev;
10675 
10676                 if (time_after(jiffies, warning_time +
10677                                READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
10678                         list_for_each_entry(dev, list, todo_list) {
10679                                 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10680                                          dev->name, netdev_refcnt_read(dev));
10681                                 ref_tracker_dir_print(&dev->refcnt_tracker, 10);
10682                         }
10683 
10684                         warning_time = jiffies;
10685                 }
10686         }
10687 }
10688 
10689 /* The sequence is:
10690  *
10691  *      rtnl_lock();
10692  *      ...
10693  *      register_netdevice(x1);
10694  *      register_netdevice(x2);
10695  *      ...
10696  *      unregister_netdevice(y1);
10697  *      unregister_netdevice(y2);
10698  *      ...
10699  *      rtnl_unlock();
10700  *      free_netdev(y1);
10701  *      free_netdev(y2);
10702  *
10703  * We are invoked by rtnl_unlock().
10704  * This allows us to deal with problems:
10705  * 1) We can delete sysfs objects which invoke hotplug
10706  *    without deadlocking with linkwatch via keventd.
10707  * 2) Since we run with the RTNL semaphore not held, we can sleep
10708  *    safely in order to wait for the netdev refcnt to drop to zero.
10709  *
10710  * We must not return until all unregister events added during
10711  * the interval the lock was held have been completed.
10712  */
10713 void netdev_run_todo(void)
10714 {
10715         struct net_device *dev, *tmp;
10716         struct list_head list;
10717         int cnt;
10718 #ifdef CONFIG_LOCKDEP
10719         struct list_head unlink_list;
10720 
10721         list_replace_init(&net_unlink_list, &unlink_list);
10722 
10723         while (!list_empty(&unlink_list)) {
10724                 struct net_device *dev = list_first_entry(&unlink_list,
10725                                                           struct net_device,
10726                                                           unlink_list);
10727                 list_del_init(&dev->unlink_list);
10728                 dev->nested_level = dev->lower_level - 1;
10729         }
10730 #endif
10731 
10732         /* Snapshot list, allow later requests */
10733         list_replace_init(&net_todo_list, &list);
10734 
10735         __rtnl_unlock();
10736 
10737         /* Wait for rcu callbacks to finish before next phase */
10738         if (!list_empty(&list))
10739                 rcu_barrier();
10740 
10741         list_for_each_entry_safe(dev, tmp, &list, todo_list) {
10742                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10743                         netdev_WARN(dev, "run_todo but not unregistering\n");
10744                         list_del(&dev->todo_list);
10745                         continue;
10746                 }
10747 
10748                 WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
10749                 linkwatch_sync_dev(dev);
10750         }
10751 
10752         cnt = 0;
10753         while (!list_empty(&list)) {
10754                 dev = netdev_wait_allrefs_any(&list);
10755                 list_del(&dev->todo_list);
10756 
10757                 /* paranoia */
10758                 BUG_ON(netdev_refcnt_read(dev) != 1);
10759                 BUG_ON(!list_empty(&dev->ptype_all));
10760                 BUG_ON(!list_empty(&dev->ptype_specific));
10761                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
10762                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10763 
10764                 netdev_do_free_pcpu_stats(dev);
10765                 if (dev->priv_destructor)
10766                         dev->priv_destructor(dev);
10767                 if (dev->needs_free_netdev)
10768                         free_netdev(dev);
10769 
10770                 cnt++;
10771 
10772                 /* Free network device */
10773                 kobject_put(&dev->dev.kobj);
10774         }
10775         if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
10776                 wake_up(&netdev_unregistering_wq);
10777 }
10778 
10779 /* Collate per-cpu network dstats statistics
10780  *
10781  * Read per-cpu network statistics from dev->dstats and populate the related
10782  * fields in @s.
10783  */
10784 static void dev_fetch_dstats(struct rtnl_link_stats64 *s,
10785                              const struct pcpu_dstats __percpu *dstats)
10786 {
10787         int cpu;
10788 
10789         for_each_possible_cpu(cpu) {
10790                 u64 rx_packets, rx_bytes, rx_drops;
10791                 u64 tx_packets, tx_bytes, tx_drops;
10792                 const struct pcpu_dstats *stats;
10793                 unsigned int start;
10794 
10795                 stats = per_cpu_ptr(dstats, cpu);
10796                 do {
10797                         start = u64_stats_fetch_begin(&stats->syncp);
10798                         rx_packets = u64_stats_read(&stats->rx_packets);
10799                         rx_bytes   = u64_stats_read(&stats->rx_bytes);
10800                         rx_drops   = u64_stats_read(&stats->rx_drops);
10801                         tx_packets = u64_stats_read(&stats->tx_packets);
10802                         tx_bytes   = u64_stats_read(&stats->tx_bytes);
10803                         tx_drops   = u64_stats_read(&stats->tx_drops);
10804                 } while (u64_stats_fetch_retry(&stats->syncp, start));
10805 
10806                 s->rx_packets += rx_packets;
10807                 s->rx_bytes   += rx_bytes;
10808                 s->rx_dropped += rx_drops;
10809                 s->tx_packets += tx_packets;
10810                 s->tx_bytes   += tx_bytes;
10811                 s->tx_dropped += tx_drops;
10812         }
10813 }
10814 
10815 /* ndo_get_stats64 implementation for dtstats-based accounting.
10816  *
10817  * Populate @s from dev->stats and dev->dstats. This is used internally by the
10818  * core for NETDEV_PCPU_STAT_DSTAT-type stats collection.
10819  */
10820 static void dev_get_dstats64(const struct net_device *dev,
10821                              struct rtnl_link_stats64 *s)
10822 {
10823         netdev_stats_to_stats64(s, &dev->stats);
10824         dev_fetch_dstats(s, dev->dstats);
10825 }
10826 
10827 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10828  * all the same fields in the same order as net_device_stats, with only
10829  * the type differing, but rtnl_link_stats64 may have additional fields
10830  * at the end for newer counters.
10831  */
10832 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10833                              const struct net_device_stats *netdev_stats)
10834 {
10835         size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
10836         const atomic_long_t *src = (atomic_long_t *)netdev_stats;
10837         u64 *dst = (u64 *)stats64;
10838 
10839         BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10840         for (i = 0; i < n; i++)
10841                 dst[i] = (unsigned long)atomic_long_read(&src[i]);
10842         /* zero out counters that only exist in rtnl_link_stats64 */
10843         memset((char *)stats64 + n * sizeof(u64), 0,
10844                sizeof(*stats64) - n * sizeof(u64));
10845 }
10846 EXPORT_SYMBOL(netdev_stats_to_stats64);
10847 
10848 static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
10849                 struct net_device *dev)
10850 {
10851         struct net_device_core_stats __percpu *p;
10852 
10853         p = alloc_percpu_gfp(struct net_device_core_stats,
10854                              GFP_ATOMIC | __GFP_NOWARN);
10855 
10856         if (p && cmpxchg(&dev->core_stats, NULL, p))
10857                 free_percpu(p);
10858 
10859         /* This READ_ONCE() pairs with the cmpxchg() above */
10860         return READ_ONCE(dev->core_stats);
10861 }
10862 
10863 noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
10864 {
10865         /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10866         struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
10867         unsigned long __percpu *field;
10868 
10869         if (unlikely(!p)) {
10870                 p = netdev_core_stats_alloc(dev);
10871                 if (!p)
10872                         return;
10873         }
10874 
10875         field = (__force unsigned long __percpu *)((__force void *)p + offset);
10876         this_cpu_inc(*field);
10877 }
10878 EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
10879 
10880 /**
10881  *      dev_get_stats   - get network device statistics
10882  *      @dev: device to get statistics from
10883  *      @storage: place to store stats
10884  *
10885  *      Get network statistics from device. Return @storage.
10886  *      The device driver may provide its own method by setting
10887  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10888  *      otherwise the internal statistics structure is used.
10889  */
10890 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10891                                         struct rtnl_link_stats64 *storage)
10892 {
10893         const struct net_device_ops *ops = dev->netdev_ops;
10894         const struct net_device_core_stats __percpu *p;
10895 
10896         if (ops->ndo_get_stats64) {
10897                 memset(storage, 0, sizeof(*storage));
10898                 ops->ndo_get_stats64(dev, storage);
10899         } else if (ops->ndo_get_stats) {
10900                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10901         } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
10902                 dev_get_tstats64(dev, storage);
10903         } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
10904                 dev_get_dstats64(dev, storage);
10905         } else {
10906                 netdev_stats_to_stats64(storage, &dev->stats);
10907         }
10908 
10909         /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
10910         p = READ_ONCE(dev->core_stats);
10911         if (p) {
10912                 const struct net_device_core_stats *core_stats;
10913                 int i;
10914 
10915                 for_each_possible_cpu(i) {
10916                         core_stats = per_cpu_ptr(p, i);
10917                         storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
10918                         storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
10919                         storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
10920                         storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
10921                 }
10922         }
10923         return storage;
10924 }
10925 EXPORT_SYMBOL(dev_get_stats);
10926 
10927 /**
10928  *      dev_fetch_sw_netstats - get per-cpu network device statistics
10929  *      @s: place to store stats
10930  *      @netstats: per-cpu network stats to read from
10931  *
10932  *      Read per-cpu network statistics and populate the related fields in @s.
10933  */
10934 void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10935                            const struct pcpu_sw_netstats __percpu *netstats)
10936 {
10937         int cpu;
10938 
10939         for_each_possible_cpu(cpu) {
10940                 u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
10941                 const struct pcpu_sw_netstats *stats;
10942                 unsigned int start;
10943 
10944                 stats = per_cpu_ptr(netstats, cpu);
10945                 do {
10946                         start = u64_stats_fetch_begin(&stats->syncp);
10947                         rx_packets = u64_stats_read(&stats->rx_packets);
10948                         rx_bytes   = u64_stats_read(&stats->rx_bytes);
10949                         tx_packets = u64_stats_read(&stats->tx_packets);
10950                         tx_bytes   = u64_stats_read(&stats->tx_bytes);
10951                 } while (u64_stats_fetch_retry(&stats->syncp, start));
10952 
10953                 s->rx_packets += rx_packets;
10954                 s->rx_bytes   += rx_bytes;
10955                 s->tx_packets += tx_packets;
10956                 s->tx_bytes   += tx_bytes;
10957         }
10958 }
10959 EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10960 
10961 /**
10962  *      dev_get_tstats64 - ndo_get_stats64 implementation
10963  *      @dev: device to get statistics from
10964  *      @s: place to store stats
10965  *
10966  *      Populate @s from dev->stats and dev->tstats. Can be used as
10967  *      ndo_get_stats64() callback.
10968  */
10969 void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
10970 {
10971         netdev_stats_to_stats64(s, &dev->stats);
10972         dev_fetch_sw_netstats(s, dev->tstats);
10973 }
10974 EXPORT_SYMBOL_GPL(dev_get_tstats64);
10975 
10976 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
10977 {
10978         struct netdev_queue *queue = dev_ingress_queue(dev);
10979 
10980 #ifdef CONFIG_NET_CLS_ACT
10981         if (queue)
10982                 return queue;
10983         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
10984         if (!queue)
10985                 return NULL;
10986         netdev_init_one_queue(dev, queue, NULL);
10987         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
10988         RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc);
10989         rcu_assign_pointer(dev->ingress_queue, queue);
10990 #endif
10991         return queue;
10992 }
10993 
10994 static const struct ethtool_ops default_ethtool_ops;
10995 
10996 void netdev_set_default_ethtool_ops(struct net_device *dev,
10997                                     const struct ethtool_ops *ops)
10998 {
10999         if (dev->ethtool_ops == &default_ethtool_ops)
11000                 dev->ethtool_ops = ops;
11001 }
11002 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
11003 
11004 /**
11005  * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
11006  * @dev: netdev to enable the IRQ coalescing on
11007  *
11008  * Sets a conservative default for SW IRQ coalescing. Users can use
11009  * sysfs attributes to override the default values.
11010  */
11011 void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
11012 {
11013         WARN_ON(dev->reg_state == NETREG_REGISTERED);
11014 
11015         if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
11016                 dev->gro_flush_timeout = 20000;
11017                 dev->napi_defer_hard_irqs = 1;
11018         }
11019 }
11020 EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
11021 
11022 /**
11023  * alloc_netdev_mqs - allocate network device
11024  * @sizeof_priv: size of private data to allocate space for
11025  * @name: device name format string
11026  * @name_assign_type: origin of device name
11027  * @setup: callback to initialize device
11028  * @txqs: the number of TX subqueues to allocate
11029  * @rxqs: the number of RX subqueues to allocate
11030  *
11031  * Allocates a struct net_device with private data area for driver use
11032  * and performs basic initialization.  Also allocates subqueue structs
11033  * for each queue on the device.
11034  */
11035 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
11036                 unsigned char name_assign_type,
11037                 void (*setup)(struct net_device *),
11038                 unsigned int txqs, unsigned int rxqs)
11039 {
11040         struct net_device *dev;
11041 
11042         BUG_ON(strlen(name) >= sizeof(dev->name));
11043 
11044         if (txqs < 1) {
11045                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
11046                 return NULL;
11047         }
11048 
11049         if (rxqs < 1) {
11050                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
11051                 return NULL;
11052         }
11053 
11054         dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
11055                        GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
11056         if (!dev)
11057                 return NULL;
11058 
11059         dev->priv_len = sizeof_priv;
11060 
11061         ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
11062 #ifdef CONFIG_PCPU_DEV_REFCNT
11063         dev->pcpu_refcnt = alloc_percpu(int);
11064         if (!dev->pcpu_refcnt)
11065                 goto free_dev;
11066         __dev_hold(dev);
11067 #else
11068         refcount_set(&dev->dev_refcnt, 1);
11069 #endif
11070 
11071         if (dev_addr_init(dev))
11072                 goto free_pcpu;
11073 
11074         dev_mc_init(dev);
11075         dev_uc_init(dev);
11076 
11077         dev_net_set(dev, &init_net);
11078 
11079         dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
11080         dev->xdp_zc_max_segs = 1;
11081         dev->gso_max_segs = GSO_MAX_SEGS;
11082         dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
11083         dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
11084         dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
11085         dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
11086         dev->tso_max_segs = TSO_MAX_SEGS;
11087         dev->upper_level = 1;
11088         dev->lower_level = 1;
11089 #ifdef CONFIG_LOCKDEP
11090         dev->nested_level = 0;
11091         INIT_LIST_HEAD(&dev->unlink_list);
11092 #endif
11093 
11094         INIT_LIST_HEAD(&dev->napi_list);
11095         INIT_LIST_HEAD(&dev->unreg_list);
11096         INIT_LIST_HEAD(&dev->close_list);
11097         INIT_LIST_HEAD(&dev->link_watch_list);
11098         INIT_LIST_HEAD(&dev->adj_list.upper);
11099         INIT_LIST_HEAD(&dev->adj_list.lower);
11100         INIT_LIST_HEAD(&dev->ptype_all);
11101         INIT_LIST_HEAD(&dev->ptype_specific);
11102         INIT_LIST_HEAD(&dev->net_notifier_list);
11103 #ifdef CONFIG_NET_SCHED
11104         hash_init(dev->qdisc_hash);
11105 #endif
11106         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
11107         setup(dev);
11108 
11109         if (!dev->tx_queue_len) {
11110                 dev->priv_flags |= IFF_NO_QUEUE;
11111                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
11112         }
11113 
11114         dev->num_tx_queues = txqs;
11115         dev->real_num_tx_queues = txqs;
11116         if (netif_alloc_netdev_queues(dev))
11117                 goto free_all;
11118 
11119         dev->num_rx_queues = rxqs;
11120         dev->real_num_rx_queues = rxqs;
11121         if (netif_alloc_rx_queues(dev))
11122                 goto free_all;
11123         dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT);
11124         if (!dev->ethtool)
11125                 goto free_all;
11126 
11127         strcpy(dev->name, name);
11128         dev->name_assign_type = name_assign_type;
11129         dev->group = INIT_NETDEV_GROUP;
11130         if (!dev->ethtool_ops)
11131                 dev->ethtool_ops = &default_ethtool_ops;
11132 
11133         nf_hook_netdev_init(dev);
11134 
11135         return dev;
11136 
11137 free_all:
11138         free_netdev(dev);
11139         return NULL;
11140 
11141 free_pcpu:
11142 #ifdef CONFIG_PCPU_DEV_REFCNT
11143         free_percpu(dev->pcpu_refcnt);
11144 free_dev:
11145 #endif
11146         kvfree(dev);
11147         return NULL;
11148 }
11149 EXPORT_SYMBOL(alloc_netdev_mqs);
11150 
11151 /**
11152  * free_netdev - free network device
11153  * @dev: device
11154  *
11155  * This function does the last stage of destroying an allocated device
11156  * interface. The reference to the device object is released. If this
11157  * is the last reference then it will be freed.Must be called in process
11158  * context.
11159  */
11160 void free_netdev(struct net_device *dev)
11161 {
11162         struct napi_struct *p, *n;
11163 
11164         might_sleep();
11165 
11166         /* When called immediately after register_netdevice() failed the unwind
11167          * handling may still be dismantling the device. Handle that case by
11168          * deferring the free.
11169          */
11170         if (dev->reg_state == NETREG_UNREGISTERING) {
11171                 ASSERT_RTNL();
11172                 dev->needs_free_netdev = true;
11173                 return;
11174         }
11175 
11176         kfree(dev->ethtool);
11177         netif_free_tx_queues(dev);
11178         netif_free_rx_queues(dev);
11179 
11180         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
11181 
11182         /* Flush device addresses */
11183         dev_addr_flush(dev);
11184 
11185         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
11186                 netif_napi_del(p);
11187 
11188         ref_tracker_dir_exit(&dev->refcnt_tracker);
11189 #ifdef CONFIG_PCPU_DEV_REFCNT
11190         free_percpu(dev->pcpu_refcnt);
11191         dev->pcpu_refcnt = NULL;
11192 #endif
11193         free_percpu(dev->core_stats);
11194         dev->core_stats = NULL;
11195         free_percpu(dev->xdp_bulkq);
11196         dev->xdp_bulkq = NULL;
11197 
11198         /*  Compatibility with error handling in drivers */
11199         if (dev->reg_state == NETREG_UNINITIALIZED ||
11200             dev->reg_state == NETREG_DUMMY) {
11201                 kvfree(dev);
11202                 return;
11203         }
11204 
11205         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
11206         WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
11207 
11208         /* will free via device release */
11209         put_device(&dev->dev);
11210 }
11211 EXPORT_SYMBOL(free_netdev);
11212 
11213 /**
11214  * alloc_netdev_dummy - Allocate and initialize a dummy net device.
11215  * @sizeof_priv: size of private data to allocate space for
11216  *
11217  * Return: the allocated net_device on success, NULL otherwise
11218  */
11219 struct net_device *alloc_netdev_dummy(int sizeof_priv)
11220 {
11221         return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
11222                             init_dummy_netdev_core);
11223 }
11224 EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
11225 
11226 /**
11227  *      synchronize_net -  Synchronize with packet receive processing
11228  *
11229  *      Wait for packets currently being received to be done.
11230  *      Does not block later packets from starting.
11231  */
11232 void synchronize_net(void)
11233 {
11234         might_sleep();
11235         if (rtnl_is_locked())
11236                 synchronize_rcu_expedited();
11237         else
11238                 synchronize_rcu();
11239 }
11240 EXPORT_SYMBOL(synchronize_net);
11241 
11242 static void netdev_rss_contexts_free(struct net_device *dev)
11243 {
11244         struct ethtool_rxfh_context *ctx;
11245         unsigned long context;
11246 
11247         mutex_lock(&dev->ethtool->rss_lock);
11248         xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
11249                 struct ethtool_rxfh_param rxfh;
11250 
11251                 rxfh.indir = ethtool_rxfh_context_indir(ctx);
11252                 rxfh.key = ethtool_rxfh_context_key(ctx);
11253                 rxfh.hfunc = ctx->hfunc;
11254                 rxfh.input_xfrm = ctx->input_xfrm;
11255                 rxfh.rss_context = context;
11256                 rxfh.rss_delete = true;
11257 
11258                 xa_erase(&dev->ethtool->rss_ctx, context);
11259                 if (dev->ethtool_ops->create_rxfh_context)
11260                         dev->ethtool_ops->remove_rxfh_context(dev, ctx,
11261                                                               context, NULL);
11262                 else
11263                         dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL);
11264                 kfree(ctx);
11265         }
11266         xa_destroy(&dev->ethtool->rss_ctx);
11267         mutex_unlock(&dev->ethtool->rss_lock);
11268 }
11269 
11270 /**
11271  *      unregister_netdevice_queue - remove device from the kernel
11272  *      @dev: device
11273  *      @head: list
11274  *
11275  *      This function shuts down a device interface and removes it
11276  *      from the kernel tables.
11277  *      If head not NULL, device is queued to be unregistered later.
11278  *
11279  *      Callers must hold the rtnl semaphore.  You may want
11280  *      unregister_netdev() instead of this.
11281  */
11282 
11283 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
11284 {
11285         ASSERT_RTNL();
11286 
11287         if (head) {
11288                 list_move_tail(&dev->unreg_list, head);
11289         } else {
11290                 LIST_HEAD(single);
11291 
11292                 list_add(&dev->unreg_list, &single);
11293                 unregister_netdevice_many(&single);
11294         }
11295 }
11296 EXPORT_SYMBOL(unregister_netdevice_queue);
11297 
11298 void unregister_netdevice_many_notify(struct list_head *head,
11299                                       u32 portid, const struct nlmsghdr *nlh)
11300 {
11301         struct net_device *dev, *tmp;
11302         LIST_HEAD(close_head);
11303         int cnt = 0;
11304 
11305         BUG_ON(dev_boot_phase);
11306         ASSERT_RTNL();
11307 
11308         if (list_empty(head))
11309                 return;
11310 
11311         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
11312                 /* Some devices call without registering
11313                  * for initialization unwind. Remove those
11314                  * devices and proceed with the remaining.
11315                  */
11316                 if (dev->reg_state == NETREG_UNINITIALIZED) {
11317                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
11318                                  dev->name, dev);
11319 
11320                         WARN_ON(1);
11321                         list_del(&dev->unreg_list);
11322                         continue;
11323                 }
11324                 dev->dismantle = true;
11325                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
11326         }
11327 
11328         /* If device is running, close it first. */
11329         list_for_each_entry(dev, head, unreg_list)
11330                 list_add_tail(&dev->close_list, &close_head);
11331         dev_close_many(&close_head, true);
11332 
11333         list_for_each_entry(dev, head, unreg_list) {
11334                 /* And unlink it from device chain. */
11335                 unlist_netdevice(dev);
11336                 WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
11337         }
11338         flush_all_backlogs();
11339 
11340         synchronize_net();
11341 
11342         list_for_each_entry(dev, head, unreg_list) {
11343                 struct sk_buff *skb = NULL;
11344 
11345                 /* Shutdown queueing discipline. */
11346                 dev_shutdown(dev);
11347                 dev_tcx_uninstall(dev);
11348                 dev_xdp_uninstall(dev);
11349                 bpf_dev_bound_netdev_unregister(dev);
11350 
11351                 netdev_offload_xstats_disable_all(dev);
11352 
11353                 /* Notify protocols, that we are about to destroy
11354                  * this device. They should clean all the things.
11355                  */
11356                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11357 
11358                 if (!dev->rtnl_link_ops ||
11359                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
11360                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
11361                                                      GFP_KERNEL, NULL, 0,
11362                                                      portid, nlh);
11363 
11364                 /*
11365                  *      Flush the unicast and multicast chains
11366                  */
11367                 dev_uc_flush(dev);
11368                 dev_mc_flush(dev);
11369 
11370                 netdev_name_node_alt_flush(dev);
11371                 netdev_name_node_free(dev->name_node);
11372 
11373                 netdev_rss_contexts_free(dev);
11374 
11375                 call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
11376 
11377                 if (dev->netdev_ops->ndo_uninit)
11378                         dev->netdev_ops->ndo_uninit(dev);
11379 
11380                 mutex_destroy(&dev->ethtool->rss_lock);
11381 
11382                 if (skb)
11383                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
11384 
11385                 /* Notifier chain MUST detach us all upper devices. */
11386                 WARN_ON(netdev_has_any_upper_dev(dev));
11387                 WARN_ON(netdev_has_any_lower_dev(dev));
11388 
11389                 /* Remove entries from kobject tree */
11390                 netdev_unregister_kobject(dev);
11391 #ifdef CONFIG_XPS
11392                 /* Remove XPS queueing entries */
11393                 netif_reset_xps_queues_gt(dev, 0);
11394 #endif
11395         }
11396 
11397         synchronize_net();
11398 
11399         list_for_each_entry(dev, head, unreg_list) {
11400                 netdev_put(dev, &dev->dev_registered_tracker);
11401                 net_set_todo(dev);
11402                 cnt++;
11403         }
11404         atomic_add(cnt, &dev_unreg_count);
11405 
11406         list_del(head);
11407 }
11408 
11409 /**
11410  *      unregister_netdevice_many - unregister many devices
11411  *      @head: list of devices
11412  *
11413  *  Note: As most callers use a stack allocated list_head,
11414  *  we force a list_del() to make sure stack wont be corrupted later.
11415  */
11416 void unregister_netdevice_many(struct list_head *head)
11417 {
11418         unregister_netdevice_many_notify(head, 0, NULL);
11419 }
11420 EXPORT_SYMBOL(unregister_netdevice_many);
11421 
11422 /**
11423  *      unregister_netdev - remove device from the kernel
11424  *      @dev: device
11425  *
11426  *      This function shuts down a device interface and removes it
11427  *      from the kernel tables.
11428  *
11429  *      This is just a wrapper for unregister_netdevice that takes
11430  *      the rtnl semaphore.  In general you want to use this and not
11431  *      unregister_netdevice.
11432  */
11433 void unregister_netdev(struct net_device *dev)
11434 {
11435         rtnl_lock();
11436         unregister_netdevice(dev);
11437         rtnl_unlock();
11438 }
11439 EXPORT_SYMBOL(unregister_netdev);
11440 
11441 /**
11442  *      __dev_change_net_namespace - move device to different nethost namespace
11443  *      @dev: device
11444  *      @net: network namespace
11445  *      @pat: If not NULL name pattern to try if the current device name
11446  *            is already taken in the destination network namespace.
11447  *      @new_ifindex: If not zero, specifies device index in the target
11448  *                    namespace.
11449  *
11450  *      This function shuts down a device interface and moves it
11451  *      to a new network namespace. On success 0 is returned, on
11452  *      a failure a netagive errno code is returned.
11453  *
11454  *      Callers must hold the rtnl semaphore.
11455  */
11456 
11457 int __dev_change_net_namespace(struct net_device *dev, struct net *net,
11458                                const char *pat, int new_ifindex)
11459 {
11460         struct netdev_name_node *name_node;
11461         struct net *net_old = dev_net(dev);
11462         char new_name[IFNAMSIZ] = {};
11463         int err, new_nsid;
11464 
11465         ASSERT_RTNL();
11466 
11467         /* Don't allow namespace local devices to be moved. */
11468         err = -EINVAL;
11469         if (dev->features & NETIF_F_NETNS_LOCAL)
11470                 goto out;
11471 
11472         /* Ensure the device has been registrered */
11473         if (dev->reg_state != NETREG_REGISTERED)
11474                 goto out;
11475 
11476         /* Get out if there is nothing todo */
11477         err = 0;
11478         if (net_eq(net_old, net))
11479                 goto out;
11480 
11481         /* Pick the destination device name, and ensure
11482          * we can use it in the destination network namespace.
11483          */
11484         err = -EEXIST;
11485         if (netdev_name_in_use(net, dev->name)) {
11486                 /* We get here if we can't use the current device name */
11487                 if (!pat)
11488                         goto out;
11489                 err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
11490                 if (err < 0)
11491                         goto out;
11492         }
11493         /* Check that none of the altnames conflicts. */
11494         err = -EEXIST;
11495         netdev_for_each_altname(dev, name_node)
11496                 if (netdev_name_in_use(net, name_node->name))
11497                         goto out;
11498 
11499         /* Check that new_ifindex isn't used yet. */
11500         if (new_ifindex) {
11501                 err = dev_index_reserve(net, new_ifindex);
11502                 if (err < 0)
11503                         goto out;
11504         } else {
11505                 /* If there is an ifindex conflict assign a new one */
11506                 err = dev_index_reserve(net, dev->ifindex);
11507                 if (err == -EBUSY)
11508                         err = dev_index_reserve(net, 0);
11509                 if (err < 0)
11510                         goto out;
11511                 new_ifindex = err;
11512         }
11513 
11514         /*
11515          * And now a mini version of register_netdevice unregister_netdevice.
11516          */
11517 
11518         /* If device is running close it first. */
11519         dev_close(dev);
11520 
11521         /* And unlink it from device chain */
11522         unlist_netdevice(dev);
11523 
11524         synchronize_net();
11525 
11526         /* Shutdown queueing discipline. */
11527         dev_shutdown(dev);
11528 
11529         /* Notify protocols, that we are about to destroy
11530          * this device. They should clean all the things.
11531          *
11532          * Note that dev->reg_state stays at NETREG_REGISTERED.
11533          * This is wanted because this way 8021q and macvlan know
11534          * the device is just moving and can keep their slaves up.
11535          */
11536         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11537         rcu_barrier();
11538 
11539         new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
11540 
11541         rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
11542                             new_ifindex);
11543 
11544         /*
11545          *      Flush the unicast and multicast chains
11546          */
11547         dev_uc_flush(dev);
11548         dev_mc_flush(dev);
11549 
11550         /* Send a netdev-removed uevent to the old namespace */
11551         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
11552         netdev_adjacent_del_links(dev);
11553 
11554         /* Move per-net netdevice notifiers that are following the netdevice */
11555         move_netdevice_notifiers_dev_net(dev, net);
11556 
11557         /* Actually switch the network namespace */
11558         dev_net_set(dev, net);
11559         dev->ifindex = new_ifindex;
11560 
11561         if (new_name[0]) {
11562                 /* Rename the netdev to prepared name */
11563                 write_seqlock_bh(&netdev_rename_lock);
11564                 strscpy(dev->name, new_name, IFNAMSIZ);
11565                 write_sequnlock_bh(&netdev_rename_lock);
11566         }
11567 
11568         /* Fixup kobjects */
11569         dev_set_uevent_suppress(&dev->dev, 1);
11570         err = device_rename(&dev->dev, dev->name);
11571         dev_set_uevent_suppress(&dev->dev, 0);
11572         WARN_ON(err);
11573 
11574         /* Send a netdev-add uevent to the new namespace */
11575         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
11576         netdev_adjacent_add_links(dev);
11577 
11578         /* Adapt owner in case owning user namespace of target network
11579          * namespace is different from the original one.
11580          */
11581         err = netdev_change_owner(dev, net_old, net);
11582         WARN_ON(err);
11583 
11584         /* Add the device back in the hashes */
11585         list_netdevice(dev);
11586 
11587         /* Notify protocols, that a new device appeared. */
11588         call_netdevice_notifiers(NETDEV_REGISTER, dev);
11589 
11590         /*
11591          *      Prevent userspace races by waiting until the network
11592          *      device is fully setup before sending notifications.
11593          */
11594         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
11595 
11596         synchronize_net();
11597         err = 0;
11598 out:
11599         return err;
11600 }
11601 EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
11602 
11603 static int dev_cpu_dead(unsigned int oldcpu)
11604 {
11605         struct sk_buff **list_skb;
11606         struct sk_buff *skb;
11607         unsigned int cpu;
11608         struct softnet_data *sd, *oldsd, *remsd = NULL;
11609 
11610         local_irq_disable();
11611         cpu = smp_processor_id();
11612         sd = &per_cpu(softnet_data, cpu);
11613         oldsd = &per_cpu(softnet_data, oldcpu);
11614 
11615         /* Find end of our completion_queue. */
11616         list_skb = &sd->completion_queue;
11617         while (*list_skb)
11618                 list_skb = &(*list_skb)->next;
11619         /* Append completion queue from offline CPU. */
11620         *list_skb = oldsd->completion_queue;
11621         oldsd->completion_queue = NULL;
11622 
11623         /* Append output queue from offline CPU. */
11624         if (oldsd->output_queue) {
11625                 *sd->output_queue_tailp = oldsd->output_queue;
11626                 sd->output_queue_tailp = oldsd->output_queue_tailp;
11627                 oldsd->output_queue = NULL;
11628                 oldsd->output_queue_tailp = &oldsd->output_queue;
11629         }
11630         /* Append NAPI poll list from offline CPU, with one exception :
11631          * process_backlog() must be called by cpu owning percpu backlog.
11632          * We properly handle process_queue & input_pkt_queue later.
11633          */
11634         while (!list_empty(&oldsd->poll_list)) {
11635                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
11636                                                             struct napi_struct,
11637                                                             poll_list);
11638 
11639                 list_del_init(&napi->poll_list);
11640                 if (napi->poll == process_backlog)
11641                         napi->state &= NAPIF_STATE_THREADED;
11642                 else
11643                         ____napi_schedule(sd, napi);
11644         }
11645 
11646         raise_softirq_irqoff(NET_TX_SOFTIRQ);
11647         local_irq_enable();
11648 
11649         if (!use_backlog_threads()) {
11650 #ifdef CONFIG_RPS
11651                 remsd = oldsd->rps_ipi_list;
11652                 oldsd->rps_ipi_list = NULL;
11653 #endif
11654                 /* send out pending IPI's on offline CPU */
11655                 net_rps_send_ipi(remsd);
11656         }
11657 
11658         /* Process offline CPU's input_pkt_queue */
11659         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
11660                 netif_rx(skb);
11661                 rps_input_queue_head_incr(oldsd);
11662         }
11663         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
11664                 netif_rx(skb);
11665                 rps_input_queue_head_incr(oldsd);
11666         }
11667 
11668         return 0;
11669 }
11670 
11671 /**
11672  *      netdev_increment_features - increment feature set by one
11673  *      @all: current feature set
11674  *      @one: new feature set
11675  *      @mask: mask feature set
11676  *
11677  *      Computes a new feature set after adding a device with feature set
11678  *      @one to the master device with current feature set @all.  Will not
11679  *      enable anything that is off in @mask. Returns the new feature set.
11680  */
11681 netdev_features_t netdev_increment_features(netdev_features_t all,
11682         netdev_features_t one, netdev_features_t mask)
11683 {
11684         if (mask & NETIF_F_HW_CSUM)
11685                 mask |= NETIF_F_CSUM_MASK;
11686         mask |= NETIF_F_VLAN_CHALLENGED;
11687 
11688         all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11689         all &= one | ~NETIF_F_ALL_FOR_ALL;
11690 
11691         /* If one device supports hw checksumming, set for all. */
11692         if (all & NETIF_F_HW_CSUM)
11693                 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11694 
11695         return all;
11696 }
11697 EXPORT_SYMBOL(netdev_increment_features);
11698 
11699 static struct hlist_head * __net_init netdev_create_hash(void)
11700 {
11701         int i;
11702         struct hlist_head *hash;
11703 
11704         hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11705         if (hash != NULL)
11706                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
11707                         INIT_HLIST_HEAD(&hash[i]);
11708 
11709         return hash;
11710 }
11711 
11712 /* Initialize per network namespace state */
11713 static int __net_init netdev_init(struct net *net)
11714 {
11715         BUILD_BUG_ON(GRO_HASH_BUCKETS >
11716                      8 * sizeof_field(struct napi_struct, gro_bitmask));
11717 
11718         INIT_LIST_HEAD(&net->dev_base_head);
11719 
11720         net->dev_name_head = netdev_create_hash();
11721         if (net->dev_name_head == NULL)
11722                 goto err_name;
11723 
11724         net->dev_index_head = netdev_create_hash();
11725         if (net->dev_index_head == NULL)
11726                 goto err_idx;
11727 
11728         xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
11729 
11730         RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11731 
11732         return 0;
11733 
11734 err_idx:
11735         kfree(net->dev_name_head);
11736 err_name:
11737         return -ENOMEM;
11738 }
11739 
11740 /**
11741  *      netdev_drivername - network driver for the device
11742  *      @dev: network device
11743  *
11744  *      Determine network driver for device.
11745  */
11746 const char *netdev_drivername(const struct net_device *dev)
11747 {
11748         const struct device_driver *driver;
11749         const struct device *parent;
11750         const char *empty = "";
11751 
11752         parent = dev->dev.parent;
11753         if (!parent)
11754                 return empty;
11755 
11756         driver = parent->driver;
11757         if (driver && driver->name)
11758                 return driver->name;
11759         return empty;
11760 }
11761 
11762 static void __netdev_printk(const char *level, const struct net_device *dev,
11763                             struct va_format *vaf)
11764 {
11765         if (dev && dev->dev.parent) {
11766                 dev_printk_emit(level[1] - '',
11767                                 dev->dev.parent,
11768                                 "%s %s %s%s: %pV",
11769                                 dev_driver_string(dev->dev.parent),
11770                                 dev_name(dev->dev.parent),
11771                                 netdev_name(dev), netdev_reg_state(dev),
11772                                 vaf);
11773         } else if (dev) {
11774                 printk("%s%s%s: %pV",
11775                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
11776         } else {
11777                 printk("%s(NULL net_device): %pV", level, vaf);
11778         }
11779 }
11780 
11781 void netdev_printk(const char *level, const struct net_device *dev,
11782                    const char *format, ...)
11783 {
11784         struct va_format vaf;
11785         va_list args;
11786 
11787         va_start(args, format);
11788 
11789         vaf.fmt = format;
11790         vaf.va = &args;
11791 
11792         __netdev_printk(level, dev, &vaf);
11793 
11794         va_end(args);
11795 }
11796 EXPORT_SYMBOL(netdev_printk);
11797 
11798 #define define_netdev_printk_level(func, level)                 \
11799 void func(const struct net_device *dev, const char *fmt, ...)   \
11800 {                                                               \
11801         struct va_format vaf;                                   \
11802         va_list args;                                           \
11803                                                                 \
11804         va_start(args, fmt);                                    \
11805                                                                 \
11806         vaf.fmt = fmt;                                          \
11807         vaf.va = &args;                                         \
11808                                                                 \
11809         __netdev_printk(level, dev, &vaf);                      \
11810                                                                 \
11811         va_end(args);                                           \
11812 }                                                               \
11813 EXPORT_SYMBOL(func);
11814 
11815 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11816 define_netdev_printk_level(netdev_alert, KERN_ALERT);
11817 define_netdev_printk_level(netdev_crit, KERN_CRIT);
11818 define_netdev_printk_level(netdev_err, KERN_ERR);
11819 define_netdev_printk_level(netdev_warn, KERN_WARNING);
11820 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11821 define_netdev_printk_level(netdev_info, KERN_INFO);
11822 
11823 static void __net_exit netdev_exit(struct net *net)
11824 {
11825         kfree(net->dev_name_head);
11826         kfree(net->dev_index_head);
11827         xa_destroy(&net->dev_by_index);
11828         if (net != &init_net)
11829                 WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11830 }
11831 
11832 static struct pernet_operations __net_initdata netdev_net_ops = {
11833         .init = netdev_init,
11834         .exit = netdev_exit,
11835 };
11836 
11837 static void __net_exit default_device_exit_net(struct net *net)
11838 {
11839         struct netdev_name_node *name_node, *tmp;
11840         struct net_device *dev, *aux;
11841         /*
11842          * Push all migratable network devices back to the
11843          * initial network namespace
11844          */
11845         ASSERT_RTNL();
11846         for_each_netdev_safe(net, dev, aux) {
11847                 int err;
11848                 char fb_name[IFNAMSIZ];
11849 
11850                 /* Ignore unmoveable devices (i.e. loopback) */
11851                 if (dev->features & NETIF_F_NETNS_LOCAL)
11852                         continue;
11853 
11854                 /* Leave virtual devices for the generic cleanup */
11855                 if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
11856                         continue;
11857 
11858                 /* Push remaining network devices to init_net */
11859                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11860                 if (netdev_name_in_use(&init_net, fb_name))
11861                         snprintf(fb_name, IFNAMSIZ, "dev%%d");
11862 
11863                 netdev_for_each_altname_safe(dev, name_node, tmp)
11864                         if (netdev_name_in_use(&init_net, name_node->name))
11865                                 __netdev_name_node_alt_destroy(name_node);
11866 
11867                 err = dev_change_net_namespace(dev, &init_net, fb_name);
11868                 if (err) {
11869                         pr_emerg("%s: failed to move %s to init_net: %d\n",
11870                                  __func__, dev->name, err);
11871                         BUG();
11872                 }
11873         }
11874 }
11875 
11876 static void __net_exit default_device_exit_batch(struct list_head *net_list)
11877 {
11878         /* At exit all network devices most be removed from a network
11879          * namespace.  Do this in the reverse order of registration.
11880          * Do this across as many network namespaces as possible to
11881          * improve batching efficiency.
11882          */
11883         struct net_device *dev;
11884         struct net *net;
11885         LIST_HEAD(dev_kill_list);
11886 
11887         rtnl_lock();
11888         list_for_each_entry(net, net_list, exit_list) {
11889                 default_device_exit_net(net);
11890                 cond_resched();
11891         }
11892 
11893         list_for_each_entry(net, net_list, exit_list) {
11894                 for_each_netdev_reverse(net, dev) {
11895                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11896                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11897                         else
11898                                 unregister_netdevice_queue(dev, &dev_kill_list);
11899                 }
11900         }
11901         unregister_netdevice_many(&dev_kill_list);
11902         rtnl_unlock();
11903 }
11904 
11905 static struct pernet_operations __net_initdata default_device_ops = {
11906         .exit_batch = default_device_exit_batch,
11907 };
11908 
11909 static void __init net_dev_struct_check(void)
11910 {
11911         /* TX read-mostly hotpath */
11912         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags);
11913         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
11914         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
11915         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
11916         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
11917         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
11918         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
11919         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
11920         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
11921         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
11922         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
11923         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
11924         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
11925 #ifdef CONFIG_XPS
11926         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
11927 #endif
11928 #ifdef CONFIG_NETFILTER_EGRESS
11929         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
11930 #endif
11931 #ifdef CONFIG_NET_XGRESS
11932         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
11933 #endif
11934         CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
11935 
11936         /* TXRX read-mostly hotpath */
11937         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
11938         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
11939         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
11940         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
11941         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
11942         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
11943         CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
11944 
11945         /* RX read-mostly hotpath */
11946         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
11947         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
11948         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
11949         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
11950         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_flush_timeout);
11951         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, napi_defer_hard_irqs);
11952         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
11953         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
11954         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
11955         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
11956         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
11957 #ifdef CONFIG_NETPOLL
11958         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
11959 #endif
11960 #ifdef CONFIG_NET_XGRESS
11961         CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
11962 #endif
11963         CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 104);
11964 }
11965 
11966 /*
11967  *      Initialize the DEV module. At boot time this walks the device list and
11968  *      unhooks any devices that fail to initialise (normally hardware not
11969  *      present) and leaves us with a valid list of present and active devices.
11970  *
11971  */
11972 
11973 /* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
11974 #define SYSTEM_PERCPU_PAGE_POOL_SIZE    ((1 << 20) / PAGE_SIZE)
11975 
11976 static int net_page_pool_create(int cpuid)
11977 {
11978 #if IS_ENABLED(CONFIG_PAGE_POOL)
11979         struct page_pool_params page_pool_params = {
11980                 .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
11981                 .flags = PP_FLAG_SYSTEM_POOL,
11982                 .nid = cpu_to_mem(cpuid),
11983         };
11984         struct page_pool *pp_ptr;
11985 
11986         pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
11987         if (IS_ERR(pp_ptr))
11988                 return -ENOMEM;
11989 
11990         per_cpu(system_page_pool, cpuid) = pp_ptr;
11991 #endif
11992         return 0;
11993 }
11994 
11995 static int backlog_napi_should_run(unsigned int cpu)
11996 {
11997         struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
11998         struct napi_struct *napi = &sd->backlog;
11999 
12000         return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
12001 }
12002 
12003 static void run_backlog_napi(unsigned int cpu)
12004 {
12005         struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
12006 
12007         napi_threaded_poll_loop(&sd->backlog);
12008 }
12009 
12010 static void backlog_napi_setup(unsigned int cpu)
12011 {
12012         struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
12013         struct napi_struct *napi = &sd->backlog;
12014 
12015         napi->thread = this_cpu_read(backlog_napi);
12016         set_bit(NAPI_STATE_THREADED, &napi->state);
12017 }
12018 
12019 static struct smp_hotplug_thread backlog_threads = {
12020         .store                  = &backlog_napi,
12021         .thread_should_run      = backlog_napi_should_run,
12022         .thread_fn              = run_backlog_napi,
12023         .thread_comm            = "backlog_napi/%u",
12024         .setup                  = backlog_napi_setup,
12025 };
12026 
12027 /*
12028  *       This is called single threaded during boot, so no need
12029  *       to take the rtnl semaphore.
12030  */
12031 static int __init net_dev_init(void)
12032 {
12033         int i, rc = -ENOMEM;
12034 
12035         BUG_ON(!dev_boot_phase);
12036 
12037         net_dev_struct_check();
12038 
12039         if (dev_proc_init())
12040                 goto out;
12041 
12042         if (netdev_kobject_init())
12043                 goto out;
12044 
12045         for (i = 0; i < PTYPE_HASH_SIZE; i++)
12046                 INIT_LIST_HEAD(&ptype_base[i]);
12047 
12048         if (register_pernet_subsys(&netdev_net_ops))
12049                 goto out;
12050 
12051         /*
12052          *      Initialise the packet receive queues.
12053          */
12054 
12055         for_each_possible_cpu(i) {
12056                 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
12057                 struct softnet_data *sd = &per_cpu(softnet_data, i);
12058 
12059                 INIT_WORK(flush, flush_backlog);
12060 
12061                 skb_queue_head_init(&sd->input_pkt_queue);
12062                 skb_queue_head_init(&sd->process_queue);
12063 #ifdef CONFIG_XFRM_OFFLOAD
12064                 skb_queue_head_init(&sd->xfrm_backlog);
12065 #endif
12066                 INIT_LIST_HEAD(&sd->poll_list);
12067                 sd->output_queue_tailp = &sd->output_queue;
12068 #ifdef CONFIG_RPS
12069                 INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
12070                 sd->cpu = i;
12071 #endif
12072                 INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
12073                 spin_lock_init(&sd->defer_lock);
12074 
12075                 init_gro_hash(&sd->backlog);
12076                 sd->backlog.poll = process_backlog;
12077                 sd->backlog.weight = weight_p;
12078                 INIT_LIST_HEAD(&sd->backlog.poll_list);
12079 
12080                 if (net_page_pool_create(i))
12081                         goto out;
12082         }
12083         if (use_backlog_threads())
12084                 smpboot_register_percpu_thread(&backlog_threads);
12085 
12086         dev_boot_phase = 0;
12087 
12088         /* The loopback device is special if any other network devices
12089          * is present in a network namespace the loopback device must
12090          * be present. Since we now dynamically allocate and free the
12091          * loopback device ensure this invariant is maintained by
12092          * keeping the loopback device as the first device on the
12093          * list of network devices.  Ensuring the loopback devices
12094          * is the first device that appears and the last network device
12095          * that disappears.
12096          */
12097         if (register_pernet_device(&loopback_net_ops))
12098                 goto out;
12099 
12100         if (register_pernet_device(&default_device_ops))
12101                 goto out;
12102 
12103         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
12104         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
12105 
12106         rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
12107                                        NULL, dev_cpu_dead);
12108         WARN_ON(rc < 0);
12109         rc = 0;
12110 
12111         /* avoid static key IPIs to isolated CPUs */
12112         if (housekeeping_enabled(HK_TYPE_MISC))
12113                 net_enable_timestamp();
12114 out:
12115         if (rc < 0) {
12116                 for_each_possible_cpu(i) {
12117                         struct page_pool *pp_ptr;
12118 
12119                         pp_ptr = per_cpu(system_page_pool, i);
12120                         if (!pp_ptr)
12121                                 continue;
12122 
12123                         page_pool_destroy(pp_ptr);
12124                         per_cpu(system_page_pool, i) = NULL;
12125                 }
12126         }
12127 
12128         return rc;
12129 }
12130 
12131 subsys_initcall(net_dev_init);
12132 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php