/*
 * 	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device not kmalloced.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>

#include "net-sysfs.h"
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *	       sure which should go first, but I bet it won't make much
 *	       difference if we are running VLANs.  The good news is that
 *	       this protocol won't be in the list unless compiled in, so
 *	       the average user (w/out VLANs) will not be adversely affected.
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock().
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal
 * caller must respect an RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}
static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
272 #ifdef CONFIG_LOCKDEP
274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275 * according to dev->type
277 static const unsigned short netdev_lock_type[] =
278 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
291 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
292 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
293 ARPHRD_VOID, ARPHRD_NONE};
295 static const char *const netdev_lock_name[] =
296 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
309 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
310 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
311 "_xmit_VOID", "_xmit_NONE"};
313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif
/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in the hash buckets, and checking protocol handlers
 *	MUST start from the promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if a protocol handler that mangles packets
 *	were first on the list, it could not sense that the packet
 *	is cloned and should be copied-on-write, so it would
 *	change it and subsequent readers would get a broken packet.
 */
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return &ptype_all;
	else
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot
 *	guarantee that all CPUs that are in the middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
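/*
 * A minimal usage sketch (illustrative only, not part of this file).  A
 * protocol module would typically keep a static packet_type and register
 * it at init time; the handler name and ethertype below are hypothetical:
 *
 *	static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
 *				struct packet_type *pt,
 *				struct net_device *orig_dev)
 *	{
 *		// the handler owns the skb reference it was given
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_packet_type __read_mostly = {
 *		.type	= cpu_to_be16(0x88b5),	// hypothetical ethertype
 *		.func	= my_proto_rcv,
 *	};
 *
 *	dev_add_pack(&my_packet_type);
 *
 * Pairing the registration with dev_remove_pack() on module unload keeps
 * the handler lifetime safe with respect to concurrent receivers.
 */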
/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);
/**
 *	dev_remove_pack		 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
/*******************************************************************************

		Device Boot-time Settings Routines

*******************************************************************************/
468 /* Boot time configuration table */
469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
482 struct netdev_boot_setup *s;
486 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
487 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
488 memset(s[i].name, 0, sizeof(s[i].name));
489 strlcpy(s[i].name, name, IFNAMSIZ);
490 memcpy(&s[i].map, map, sizeof(s[i].map));
495 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
499 * netdev_boot_setup_check - check boot time settings
500 * @dev: the netdevice
502 * Check boot time settings for the device.
503 * The found settings are set for the device to be used
504 * later in the device probing.
505 * Returns 0 if no settings found, 1 if they are.
507 int netdev_boot_setup_check(struct net_device *dev)
509 struct netdev_boot_setup *s = dev_boot_setup;
512 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
513 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
514 !strcmp(dev->name, s[i].name)) {
515 dev->irq = s[i].map.irq;
516 dev->base_addr = s[i].map.base_addr;
517 dev->mem_start = s[i].map.mem_start;
518 dev->mem_end = s[i].map.mem_end;
524 EXPORT_SYMBOL(netdev_boot_setup_check);
528 * netdev_boot_base - get address from boot time settings
529 * @prefix: prefix for network device
530 * @unit: id for network device
532 * Check boot time settings for the base address of device.
533 * The found settings are set for the device to be used
534 * later in the device probing.
535 * Returns 0 if no settings found.
537 unsigned long netdev_boot_base(const char *prefix, int unit)
539 const struct netdev_boot_setup *s = dev_boot_setup;
543 sprintf(name, "%s%d", prefix, unit);
546 * If device already registered then return base of 1
547 * to indicate not to probe for this interface
549 if (__dev_get_by_name(&init_net, name))
552 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
553 if (!strcmp(name, s[i].name))
554 return s[i].map.base_addr;
559 * Saves at boot time configured settings for any netdevice.
561 int __init netdev_boot_setup(char *str)
566 str = get_options(str, ARRAY_SIZE(ints), ints);
571 memset(&map, 0, sizeof(map));
575 map.base_addr = ints[2];
577 map.mem_start = ints[3];
579 map.mem_end = ints[4];
581 /* Add new entry to the list */
582 return netdev_boot_setup_add(str, &map);
585 __setup("netdev=", netdev_boot_setup);
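/*
 * For reference, a boot line using this hook might look like the following
 * (values are illustrative; the integers map to irq, base_addr, mem_start
 * and mem_end in that order, and the trailing string is the device name):
 *
 *	netdev=5,0x340,0,0,eth1
 */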
587 /*******************************************************************************
589 Device Interface Subroutines
591 *******************************************************************************/
594 * __dev_get_by_name - find a device by its name
595 * @net: the applicable net namespace
596 * @name: name to find
598 * Find an interface by name. Must be called under RTNL semaphore
599 * or @dev_base_lock. If the name is found a pointer to the device
600 * is returned. If the name is not found then %NULL is returned. The
601 * reference counters are not incremented so the caller must be
602 * careful with locks.
605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
607 struct hlist_node *p;
608 struct net_device *dev;
609 struct hlist_head *head = dev_name_hash(net, name);
611 hlist_for_each_entry(dev, p, head, name_hlist)
612 if (!strncmp(dev->name, name, IFNAMSIZ))
617 EXPORT_SYMBOL(__dev_get_by_name);
620 * dev_get_by_name_rcu - find a device by its name
621 * @net: the applicable net namespace
622 * @name: name to find
624 * Find an interface by name.
625 * If the name is found a pointer to the device is returned.
626 * If the name is not found then %NULL is returned.
627 * The reference counters are not incremented so the caller must be
628 * careful with locks. The caller must hold RCU lock.
631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
633 struct hlist_node *p;
634 struct net_device *dev;
635 struct hlist_head *head = dev_name_hash(net, name);
637 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
638 if (!strncmp(dev->name, name, IFNAMSIZ))
643 EXPORT_SYMBOL(dev_get_by_name_rcu);
646 * dev_get_by_name - find a device by its name
647 * @net: the applicable net namespace
648 * @name: name to find
650 * Find an interface by name. This can be called from any
651 * context and does its own locking. The returned handle has
652 * the usage count incremented and the caller must use dev_put() to
653 * release it when it is no longer needed. %NULL is returned if no
654 * matching device is found.
657 struct net_device *dev_get_by_name(struct net *net, const char *name)
659 struct net_device *dev;
662 dev = dev_get_by_name_rcu(net, name);
668 EXPORT_SYMBOL(dev_get_by_name);
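/*
 * A short usage sketch (illustrative only) contrasting the refcounted and
 * RCU lookups above; "eth0" is just an example name:
 *
 *	struct net_device *dev;
 *
 *	dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		// ... use dev; a reference is held for us ...
 *		dev_put(dev);
 *	}
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev) {
 *		// ... use dev; only valid inside this RCU read section ...
 *	}
 *	rcu_read_unlock();
 */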
671 * __dev_get_by_index - find a device by its ifindex
672 * @net: the applicable net namespace
673 * @ifindex: index of device
675 * Search for an interface by index. Returns %NULL if the device
676 * is not found or a pointer to the device. The device has not
677 * had its reference counter increased so the caller must be careful
678 * about locking. The caller must hold either the RTNL semaphore
682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
684 struct hlist_node *p;
685 struct net_device *dev;
686 struct hlist_head *head = dev_index_hash(net, ifindex);
688 hlist_for_each_entry(dev, p, head, index_hlist)
689 if (dev->ifindex == ifindex)
694 EXPORT_SYMBOL(__dev_get_by_index);
697 * dev_get_by_index_rcu - find a device by its ifindex
698 * @net: the applicable net namespace
699 * @ifindex: index of device
701 * Search for an interface by index. Returns %NULL if the device
702 * is not found or a pointer to the device. The device has not
703 * had its reference counter increased so the caller must be careful
704 * about locking. The caller must hold RCU lock.
707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
709 struct hlist_node *p;
710 struct net_device *dev;
711 struct hlist_head *head = dev_index_hash(net, ifindex);
713 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
714 if (dev->ifindex == ifindex)
719 EXPORT_SYMBOL(dev_get_by_index_rcu);
723 * dev_get_by_index - find a device by its ifindex
724 * @net: the applicable net namespace
725 * @ifindex: index of device
727 * Search for an interface by index. Returns NULL if the device
728 * is not found or a pointer to the device. The device returned has
729 * had a reference added and the pointer is safe until the user calls
730 * dev_put to indicate they have finished with it.
733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
735 struct net_device *dev;
738 dev = dev_get_by_index_rcu(net, ifindex);
744 EXPORT_SYMBOL(dev_get_by_index);
747 * dev_getbyhwaddr_rcu - find a device by its hardware address
748 * @net: the applicable net namespace
749 * @type: media type of device
750 * @ha: hardware address
752 * Search for an interface by MAC address. Returns NULL if the device
753 * is not found or a pointer to the device.
754 * The caller must hold RCU or RTNL.
755 * The returned device has not had its ref count increased
756 * and the caller must therefore be careful about locking
760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
763 struct net_device *dev;
765 for_each_netdev_rcu(net, dev)
766 if (dev->type == type &&
767 !memcmp(dev->dev_addr, ha, dev->addr_len))
772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
776 struct net_device *dev;
779 for_each_netdev(net, dev)
780 if (dev->type == type)
785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
789 struct net_device *dev, *ret = NULL;
792 for_each_netdev_rcu(net, dev)
793 if (dev->type == type) {
801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
804 * dev_get_by_flags_rcu - find any device with given flags
805 * @net: the applicable net namespace
806 * @if_flags: IFF_* values
807 * @mask: bitmask of bits in if_flags to check
809 * Search for any interface with the given flags. Returns NULL if a device
810 * is not found or a pointer to the device. Must be called inside
811 * rcu_read_lock(), and result refcount is unchanged.
814 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
817 struct net_device *dev, *ret;
820 for_each_netdev_rcu(net, dev) {
821 if (((dev->flags ^ if_flags) & mask) == 0) {
828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
842 if (strlen(name) >= IFNAMSIZ)
844 if (!strcmp(name, ".") || !strcmp(name, ".."))
848 if (*name == '/' || isspace(*name))
854 EXPORT_SYMBOL(dev_valid_name);
857 * __dev_alloc_name - allocate a name for a device
858 * @net: network namespace to allocate the device name in
859 * @name: name format string
860 * @buf: scratch buffer and result name string
862 * Passed a format string - eg "lt%d" it will try and find a suitable
863 * id. It scans list of devices to build up a free map, then chooses
864 * the first empty slot. The caller must hold the dev_base or rtnl lock
* while allocating the name and adding the device in order to avoid
* duplicates.
867 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
868 * Returns the number of the unit assigned or a negative errno code.
871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
875 const int max_netdevices = 8*PAGE_SIZE;
876 unsigned long *inuse;
877 struct net_device *d;
879 p = strnchr(name, IFNAMSIZ-1, '%');
882 * Verify the string as this thing may have come from
883 * the user. There must be either one "%d" and no other "%"
886 if (p[1] != 'd' || strchr(p + 2, '%'))
889 /* Use one page as a bit array of possible slots */
890 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
894 for_each_netdev(net, d) {
895 if (!sscanf(d->name, name, &i))
897 if (i < 0 || i >= max_netdevices)
900 /* avoid cases where sscanf is not exact inverse of printf */
901 snprintf(buf, IFNAMSIZ, name, i);
902 if (!strncmp(buf, d->name, IFNAMSIZ))
906 i = find_first_zero_bit(inuse, max_netdevices);
907 free_page((unsigned long) inuse);
911 snprintf(buf, IFNAMSIZ, name, i);
912 if (!__dev_get_by_name(net, buf))
915 /* It is possible to run out of possible slots
916 * when the name is long and there isn't enough space left
917 * for the digits, or if all bits are used.
923 * dev_alloc_name - allocate a name for a device
925 * @name: name format string
927 * Passed a format string - eg "lt%d" it will try and find a suitable
928 * id. It scans list of devices to build up a free map, then chooses
929 * the first empty slot. The caller must hold the dev_base or rtnl lock
* while allocating the name and adding the device in order to avoid
* duplicates.
932 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
933 * Returns the number of the unit assigned or a negative errno code.
936 int dev_alloc_name(struct net_device *dev, const char *name)
942 BUG_ON(!dev_net(dev));
944 ret = __dev_alloc_name(net, name, buf);
946 strlcpy(dev->name, buf, IFNAMSIZ);
949 EXPORT_SYMBOL(dev_alloc_name);
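/*
 * Illustrative sketch: a driver that wants automatic numbering passes a
 * format string before registering the device; the "dummy%d" name here is
 * hypothetical:
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		goto fail;
 *	// dev->name is now e.g. "dummy0"; err holds the unit number assigned
 */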
951 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
955 BUG_ON(!dev_net(dev));
958 if (!dev_valid_name(name))
961 if (fmt && strchr(name, '%'))
962 return dev_alloc_name(dev, name);
963 else if (__dev_get_by_name(net, name))
965 else if (dev->name != name)
966 strlcpy(dev->name, name, IFNAMSIZ);
972 * dev_change_name - change name of a device
974 * @newname: name (or format string) must be at least IFNAMSIZ
976 * Change name of a device, can pass format strings "eth%d".
979 int dev_change_name(struct net_device *dev, const char *newname)
981 char oldname[IFNAMSIZ];
987 BUG_ON(!dev_net(dev));
990 if (dev->flags & IFF_UP)
993 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
996 memcpy(oldname, dev->name, IFNAMSIZ);
998 err = dev_get_valid_name(dev, newname, 1);
1003 ret = device_rename(&dev->dev, dev->name);
1005 memcpy(dev->name, oldname, IFNAMSIZ);
1009 write_lock_bh(&dev_base_lock);
1010 hlist_del(&dev->name_hlist);
1011 write_unlock_bh(&dev_base_lock);
1015 write_lock_bh(&dev_base_lock);
1016 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017 write_unlock_bh(&dev_base_lock);
1019 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020 ret = notifier_to_errno(ret);
1023 /* err >= 0 after dev_alloc_name() or stores the first errno */
1026 memcpy(dev->name, oldname, IFNAMSIZ);
1030 "%s: name change rollback failed: %d.\n",
1039 * dev_set_alias - change ifalias of a device
1041 * @alias: name up to IFALIASZ
1042 * @len: limit of bytes to copy from info
* Set ifalias for a device.
*/
1046 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1050 if (len >= IFALIASZ)
1055 kfree(dev->ifalias);
1056 dev->ifalias = NULL;
1061 dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1065 strlcpy(dev->ifalias, alias, len+1);
1071 * netdev_features_change - device changes features
1072 * @dev: device to cause notification
1074 * Called to indicate a device has changed features.
1076 void netdev_features_change(struct net_device *dev)
1078 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1080 EXPORT_SYMBOL(netdev_features_change);
1083 * netdev_state_change - device changes state
1084 * @dev: device to cause notification
1086 * Called to indicate a device has changed state. This function calls
1087 * the notifier chains for netdev_chain and sends a NEWLINK message
1088 * to the routing socket.
1090 void netdev_state_change(struct net_device *dev)
1092 if (dev->flags & IFF_UP) {
1093 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1097 EXPORT_SYMBOL(netdev_state_change);
1099 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1101 return call_netdevice_notifiers(event, dev);
1103 EXPORT_SYMBOL(netdev_bonding_change);
1106 * dev_load - load a network module
1107 * @net: the applicable net namespace
1108 * @name: name of interface
1110 * If a network interface is not present and the process has suitable
1111 * privileges this function loads the module. If module loading is not
1112 * available in this kernel then it becomes a nop.
1115 void dev_load(struct net *net, const char *name)
1117 struct net_device *dev;
1120 dev = dev_get_by_name_rcu(net, name);
1123 if (!dev && capable(CAP_NET_ADMIN))
1124 request_module("%s", name);
1126 EXPORT_SYMBOL(dev_load);
1128 static int __dev_open(struct net_device *dev)
1130 const struct net_device_ops *ops = dev->netdev_ops;
1136 * Is it even present?
1138 if (!netif_device_present(dev))
1141 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1142 ret = notifier_to_errno(ret);
1147 * Call device private open method
1149 set_bit(__LINK_STATE_START, &dev->state);
1151 if (ops->ndo_validate_addr)
1152 ret = ops->ndo_validate_addr(dev);
1154 if (!ret && ops->ndo_open)
1155 ret = ops->ndo_open(dev);
1158 * If it went open OK then:
1162 clear_bit(__LINK_STATE_START, &dev->state);
1167 dev->flags |= IFF_UP;
1172 net_dmaengine_get();
1175 * Initialize multicasting status
1177 dev_set_rx_mode(dev);
1180 * Wakeup transmit queue engine
1189 * dev_open - prepare an interface for use.
1190 * @dev: device to open
1192 * Takes a device from down to up state. The device's private open
1193 * function is invoked and then the multicast lists are loaded. Finally
1194 * the device is moved into the up state and a %NETDEV_UP message is
1195 * sent to the netdev notifier chain.
1197 * Calling this function on an active interface is a nop. On a failure
1198 * a negative errno code is returned.
1200 int dev_open(struct net_device *dev)
1207 if (dev->flags & IFF_UP)
1213 ret = __dev_open(dev);
1218 * ... and announce new interface.
1220 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1221 call_netdevice_notifiers(NETDEV_UP, dev);
1225 EXPORT_SYMBOL(dev_open);
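/*
 * Illustrative sketch of bringing an interface up from kernel code; the
 * RTNL lock must be held around the call:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 *	if (err)
 *		// the device could not be opened (not present, driver failure, ...)
 */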
1227 static int __dev_close_many(struct list_head *head)
1229 struct net_device *dev;
1234 list_for_each_entry(dev, head, unreg_list) {
/*
 * Tell people we are going down, so that they can
 * prepare for death while the device is still operating.
 */
1239 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1241 clear_bit(__LINK_STATE_START, &dev->state);
/* Synchronize to scheduled poll. We cannot touch the poll list, it
 * can even be on a different CPU. So just clear netif_running().
 *
 * dev->stop() will invoke napi_disable() on all of its
 * napi_struct instances on this device.
 */
1249 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1252 dev_deactivate_many(head);
1254 list_for_each_entry(dev, head, unreg_list) {
1255 const struct net_device_ops *ops = dev->netdev_ops;
1258 * Call the device specific close. This cannot fail.
1259 * Only if device is UP
1261 * We allow it to be called even after a DETACH hot-plug
1268 * Device is now down.
1271 dev->flags &= ~IFF_UP;
1276 net_dmaengine_put();
1282 static int __dev_close(struct net_device *dev)
1286 list_add(&dev->unreg_list, &single);
1287 return __dev_close_many(&single);
1290 static int dev_close_many(struct list_head *head)
1292 struct net_device *dev, *tmp;
1293 LIST_HEAD(tmp_list);
1295 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1296 if (!(dev->flags & IFF_UP))
1297 list_move(&dev->unreg_list, &tmp_list);
1299 __dev_close_many(head);
1302 * Tell people we are down
1304 list_for_each_entry(dev, head, unreg_list) {
1305 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1306 call_netdevice_notifiers(NETDEV_DOWN, dev);
1309 /* rollback_registered_many needs the complete original list */
1310 list_splice(&tmp_list, head);
1315 * dev_close - shutdown an interface.
1316 * @dev: device to shutdown
1318 * This function moves an active device into down state. A
1319 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1320 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1323 int dev_close(struct net_device *dev)
1327 list_add(&dev->unreg_list, &single);
1328 dev_close_many(&single);
1332 EXPORT_SYMBOL(dev_close);
1336 * dev_disable_lro - disable Large Receive Offload on a device
1339 * Disable Large Receive Offload (LRO) on a net device. Must be
1340 * called under RTNL. This is needed if received packets may be
1341 * forwarded to another interface.
1343 void dev_disable_lro(struct net_device *dev)
1345 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1346 dev->ethtool_ops->set_flags) {
1347 u32 flags = dev->ethtool_ops->get_flags(dev);
1348 if (flags & ETH_FLAG_LRO) {
1349 flags &= ~ETH_FLAG_LRO;
1350 dev->ethtool_ops->set_flags(dev, flags);
1353 WARN_ON(dev->features & NETIF_F_LRO);
1355 EXPORT_SYMBOL(dev_disable_lro);
1358 static int dev_boot_phase = 1;
1361 * Device change register/unregister. These are not inline or static
1362 * as we export them to the world.
1366 * register_netdevice_notifier - register a network notifier block
1369 * Register a notifier to be called when network device events occur.
1370 * The notifier passed is linked into the kernel structures and must
1371 * not be reused until it has been unregistered. A negative errno code
1372 * is returned on a failure.
* When registered, all registration and up events are replayed
* to the new notifier to give it a race-free
* view of the network device list.
*/
1379 int register_netdevice_notifier(struct notifier_block *nb)
1381 struct net_device *dev;
1382 struct net_device *last;
1387 err = raw_notifier_chain_register(&netdev_chain, nb);
1393 for_each_netdev(net, dev) {
1394 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1395 err = notifier_to_errno(err);
1399 if (!(dev->flags & IFF_UP))
1402 nb->notifier_call(nb, NETDEV_UP, dev);
1413 for_each_netdev(net, dev) {
1417 if (dev->flags & IFF_UP) {
1418 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1419 nb->notifier_call(nb, NETDEV_DOWN, dev);
1421 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1422 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1426 raw_notifier_chain_unregister(&netdev_chain, nb);
1429 EXPORT_SYMBOL(register_netdevice_notifier);
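/*
 * A minimal notifier sketch (illustrative only).  In this kernel the
 * notifier's data pointer is the struct net_device itself, as passed by
 * call_netdevice_notifiers() below; the callback name is hypothetical:
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			// dev just came up
 *			break;
 *		case NETDEV_UNREGISTER:
 *			// drop any references we hold on dev
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 */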
1432 * unregister_netdevice_notifier - unregister a network notifier block
* Unregister a notifier previously registered by
* register_netdevice_notifier(). The notifier is unlinked from the
* kernel structures and may then be reused. A negative errno code
1438 * is returned on a failure.
1441 int unregister_netdevice_notifier(struct notifier_block *nb)
1446 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1450 EXPORT_SYMBOL(unregister_netdevice_notifier);
1453 * call_netdevice_notifiers - call all network notifier blocks
1454 * @val: value passed unmodified to notifier function
1455 * @dev: net_device pointer passed unmodified to notifier function
1457 * Call all network notifier blocks. Parameters and return value
1458 * are as for raw_notifier_call_chain().
1461 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1464 return raw_notifier_call_chain(&netdev_chain, val, dev);
1467 /* When > 0 there are consumers of rx skb time stamps */
1468 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1470 void net_enable_timestamp(void)
1472 atomic_inc(&netstamp_needed);
1474 EXPORT_SYMBOL(net_enable_timestamp);
1476 void net_disable_timestamp(void)
1478 atomic_dec(&netstamp_needed);
1480 EXPORT_SYMBOL(net_disable_timestamp);
1482 static inline void net_timestamp_set(struct sk_buff *skb)
1484 if (atomic_read(&netstamp_needed))
1485 __net_timestamp(skb);
1487 skb->tstamp.tv64 = 0;
1490 static inline void net_timestamp_check(struct sk_buff *skb)
1492 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1493 __net_timestamp(skb);
1497 * dev_forward_skb - loopback an skb to another netif
1499 * @dev: destination network device
1500 * @skb: buffer to forward
1503 * NET_RX_SUCCESS (no congestion)
1504 * NET_RX_DROP (packet was dropped, but freed)
1506 * dev_forward_skb can be used for injecting an skb from the
1507 * start_xmit function of one device into the receive queue
1508 * of another device.
1510 * The receiving device may be in another namespace, so
1511 * we have to clear all information in the skb that could
1512 * impact namespace isolation.
1514 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1519 if (unlikely(!(dev->flags & IFF_UP) ||
1520 (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1521 atomic_long_inc(&dev->rx_dropped);
1525 skb_set_dev(skb, dev);
1526 skb->tstamp.tv64 = 0;
1527 skb->pkt_type = PACKET_HOST;
1528 skb->protocol = eth_type_trans(skb, dev);
1529 return netif_rx(skb);
1531 EXPORT_SYMBOL_GPL(dev_forward_skb);
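/*
 * Illustrative sketch of how a pair device (in the spirit of veth) might
 * use dev_forward_skb() from its transmit routine; struct my_priv and its
 * peer pointer are hypothetical driver-private state:
 *
 *	static netdev_tx_t my_pair_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *		unsigned int len = skb->len;	// read before the skb is handed off
 *
 *		if (dev_forward_skb(priv->peer, skb) == NET_RX_SUCCESS) {
 *			dev->stats.tx_packets++;
 *			dev->stats.tx_bytes += len;
 *		}
 *		return NETDEV_TX_OK;
 *	}
 */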
1533 static inline int deliver_skb(struct sk_buff *skb,
1534 struct packet_type *pt_prev,
1535 struct net_device *orig_dev)
1537 atomic_inc(&skb->users);
1538 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1542 * Support routine. Sends outgoing frames to any network
1543 * taps currently in use.
1546 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1548 struct packet_type *ptype;
1549 struct sk_buff *skb2 = NULL;
1550 struct packet_type *pt_prev = NULL;
1553 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1554 /* Never send packets back to the socket
1555 * they originated from - MvS (miquels@drinkel.ow.org)
1557 if ((ptype->dev == dev || !ptype->dev) &&
1558 (ptype->af_packet_priv == NULL ||
1559 (struct sock *)ptype->af_packet_priv != skb->sk)) {
1561 deliver_skb(skb2, pt_prev, skb->dev);
1566 skb2 = skb_clone(skb, GFP_ATOMIC);
1570 net_timestamp_set(skb2);
/* skb->nh should be correctly
 * set by the sender, so that the second statement is
 * just protection against buggy protocols.
 */
1576 skb_reset_mac_header(skb2);
1578 if (skb_network_header(skb2) < skb2->data ||
1579 skb2->network_header > skb2->tail) {
1580 if (net_ratelimit())
1581 printk(KERN_CRIT "protocol %04x is "
1583 ntohs(skb2->protocol),
1585 skb_reset_network_header(skb2);
1588 skb2->transport_header = skb2->network_header;
1589 skb2->pkt_type = PACKET_OUTGOING;
1594 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not, zero the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid and nothing can be done, so disable priority mappings. It is
 * expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
1610 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1613 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1615 /* If TC0 is invalidated disable TC mapping */
1616 if (tc->offset + tc->count > txq) {
1617 pr_warning("Number of in use tx queues changed "
1618 "invalidating tc mappings. Priority "
1619 "traffic classification disabled!\n");
1624 /* Invalidated prio to tc mappings set to TC0 */
1625 for (i = 1; i < TC_BITMASK + 1; i++) {
1626 int q = netdev_get_prio_tc_map(dev, i);
1628 tc = &dev->tc_to_txq[q];
1629 if (tc->offset + tc->count > txq) {
1630 pr_warning("Number of in use tx queues "
1631 "changed. Priority %i to tc "
1632 "mapping %i is no longer valid "
1633 "setting map to 0\n",
1635 netdev_set_prio_tc_map(dev, i, 0);
/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
 */
1644 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1648 if (txq < 1 || txq > dev->num_tx_queues)
1651 if (dev->reg_state == NETREG_REGISTERED) {
1654 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1660 netif_setup_tc(dev, txq);
1662 if (txq < dev->real_num_tx_queues)
1663 qdisc_reset_all_tx_gt(dev, txq);
1666 dev->real_num_tx_queues = txq;
1669 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1673 * netif_set_real_num_rx_queues - set actual number of RX queues used
1674 * @dev: Network device
1675 * @rxq: Actual number of RX queues
1677 * This must be called either with the rtnl_lock held or before
1678 * registration of the net device. Returns 0 on success, or a
1679 * negative error code. If called before registration, it always
1682 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1686 if (rxq < 1 || rxq > dev->num_rx_queues)
1689 if (dev->reg_state == NETREG_REGISTERED) {
1692 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1698 dev->real_num_rx_queues = rxq;
1701 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
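/*
 * Illustrative sketch: a multiqueue driver that discovers it can only use
 * a subset of the queues it allocated would typically trim both counts,
 * either before register_netdev() or under rtnl_lock(); nqueues here is
 * assumed to be whatever the hardware actually provides:
 *
 *	err = netif_set_real_num_tx_queues(dev, nqueues);
 *	if (err)
 *		goto fail;
 *	err = netif_set_real_num_rx_queues(dev, nqueues);
 *	if (err)
 *		goto fail;
 */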
1704 static inline void __netif_reschedule(struct Qdisc *q)
1706 struct softnet_data *sd;
1707 unsigned long flags;
1709 local_irq_save(flags);
1710 sd = &__get_cpu_var(softnet_data);
1711 q->next_sched = NULL;
1712 *sd->output_queue_tailp = q;
1713 sd->output_queue_tailp = &q->next_sched;
1714 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1715 local_irq_restore(flags);
1718 void __netif_schedule(struct Qdisc *q)
1720 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1721 __netif_reschedule(q);
1723 EXPORT_SYMBOL(__netif_schedule);
1725 void dev_kfree_skb_irq(struct sk_buff *skb)
1727 if (atomic_dec_and_test(&skb->users)) {
1728 struct softnet_data *sd;
1729 unsigned long flags;
1731 local_irq_save(flags);
1732 sd = &__get_cpu_var(softnet_data);
1733 skb->next = sd->completion_queue;
1734 sd->completion_queue = skb;
1735 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1736 local_irq_restore(flags);
1739 EXPORT_SYMBOL(dev_kfree_skb_irq);
1741 void dev_kfree_skb_any(struct sk_buff *skb)
1743 if (in_irq() || irqs_disabled())
1744 dev_kfree_skb_irq(skb);
1748 EXPORT_SYMBOL(dev_kfree_skb_any);
1752 * netif_device_detach - mark device as removed
1753 * @dev: network device
1755 * Mark device as removed from system and therefore no longer available.
1757 void netif_device_detach(struct net_device *dev)
1759 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1760 netif_running(dev)) {
1761 netif_tx_stop_all_queues(dev);
1764 EXPORT_SYMBOL(netif_device_detach);
1767 * netif_device_attach - mark device as attached
1768 * @dev: network device
* Mark the device as attached to the system and restart it if needed.
*/
1772 void netif_device_attach(struct net_device *dev)
1774 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1775 netif_running(dev)) {
1776 netif_tx_wake_all_queues(dev);
1777 __netdev_watchdog_up(dev);
1780 EXPORT_SYMBOL(netif_device_attach);
1783 * skb_dev_set -- assign a new device to a buffer
1784 * @skb: buffer for the new device
1785 * @dev: network device
1787 * If an skb is owned by a device already, we have to reset
1788 * all data private to the namespace a device belongs to
1789 * before assigning it a new device.
1791 #ifdef CONFIG_NET_NS
1792 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1795 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1798 skb_init_secmark(skb);
1802 skb->ipvs_property = 0;
1803 #ifdef CONFIG_NET_SCHED
1809 EXPORT_SYMBOL(skb_set_dev);
1810 #endif /* CONFIG_NET_NS */
1813 * Invalidate hardware checksum when packet is to be mangled, and
1814 * complete checksum manually on outgoing path.
1816 int skb_checksum_help(struct sk_buff *skb)
1819 int ret = 0, offset;
1821 if (skb->ip_summed == CHECKSUM_COMPLETE)
1822 goto out_set_summed;
1824 if (unlikely(skb_shinfo(skb)->gso_size)) {
1825 /* Let GSO fix up the checksum. */
1826 goto out_set_summed;
1829 offset = skb_checksum_start_offset(skb);
1830 BUG_ON(offset >= skb_headlen(skb));
1831 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1833 offset += skb->csum_offset;
1834 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1836 if (skb_cloned(skb) &&
1837 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1838 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1843 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1845 skb->ip_summed = CHECKSUM_NONE;
1849 EXPORT_SYMBOL(skb_checksum_help);
1852 * skb_gso_segment - Perform segmentation on skb.
1853 * @skb: buffer to segment
1854 * @features: features for the output path (see dev->features)
1856 * This function segments the given skb and returns a list of segments.
1858 * It may return NULL if the skb requires no segmentation. This is
1859 * only possible when GSO is used for verifying header integrity.
1861 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1863 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1864 struct packet_type *ptype;
1865 __be16 type = skb->protocol;
1866 int vlan_depth = ETH_HLEN;
1869 while (type == htons(ETH_P_8021Q)) {
1870 struct vlan_hdr *vh;
1872 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1873 return ERR_PTR(-EINVAL);
1875 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1876 type = vh->h_vlan_encapsulated_proto;
1877 vlan_depth += VLAN_HLEN;
1880 skb_reset_mac_header(skb);
1881 skb->mac_len = skb->network_header - skb->mac_header;
1882 __skb_pull(skb, skb->mac_len);
1884 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1885 struct net_device *dev = skb->dev;
1886 struct ethtool_drvinfo info = {};
1888 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1889 dev->ethtool_ops->get_drvinfo(dev, &info);
1891 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1892 info.driver, dev ? dev->features : 0L,
1893 skb->sk ? skb->sk->sk_route_caps : 0L,
1894 skb->len, skb->data_len, skb->ip_summed);
1896 if (skb_header_cloned(skb) &&
1897 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1898 return ERR_PTR(err);
1902 list_for_each_entry_rcu(ptype,
1903 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1904 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1905 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1906 err = ptype->gso_send_check(skb);
1907 segs = ERR_PTR(err);
1908 if (err || skb_gso_ok(skb, features))
1910 __skb_push(skb, (skb->data -
1911 skb_network_header(skb)));
1913 segs = ptype->gso_segment(skb, features);
1919 __skb_push(skb, skb->data - skb_mac_header(skb));
1923 EXPORT_SYMBOL(skb_gso_segment);
1925 /* Take action when hardware reception checksum errors are detected. */
1927 void netdev_rx_csum_fault(struct net_device *dev)
1929 if (net_ratelimit()) {
1930 printk(KERN_ERR "%s: hw csum failure.\n",
1931 dev ? dev->name : "<unknown>");
1935 EXPORT_SYMBOL(netdev_rx_csum_fault);
/* Actually, we should eliminate this check as soon as we know that:
 * 1. IOMMU is present and allows mapping all the memory.
 * 2. No high memory really exists on this machine.
 */
1943 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1945 #ifdef CONFIG_HIGHMEM
1947 if (!(dev->features & NETIF_F_HIGHDMA)) {
1948 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1949 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1953 if (PCI_DMA_BUS_IS_PHYS) {
1954 struct device *pdev = dev->dev.parent;
1958 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1959 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1960 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1969 void (*destructor)(struct sk_buff *skb);
1972 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1974 static void dev_gso_skb_destructor(struct sk_buff *skb)
1976 struct dev_gso_cb *cb;
1979 struct sk_buff *nskb = skb->next;
1981 skb->next = nskb->next;
1984 } while (skb->next);
1986 cb = DEV_GSO_CB(skb);
1988 cb->destructor(skb);
1992 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1993 * @skb: buffer to segment
1994 * @features: device features as applicable to this skb
1996 * This function segments the given skb and stores the list of segments
1999 static int dev_gso_segment(struct sk_buff *skb, int features)
2001 struct sk_buff *segs;
2003 segs = skb_gso_segment(skb, features);
2005 /* Verifying header integrity only. */
2010 return PTR_ERR(segs);
2013 DEV_GSO_CB(skb)->destructor = skb->destructor;
2014 skb->destructor = dev_gso_skb_destructor;
2020 * Try to orphan skb early, right before transmission by the device.
2021 * We cannot orphan skb if tx timestamp is requested or the sk-reference
2022 * is needed on driver level for other reasons, e.g. see net/can/raw.c
2024 static inline void skb_orphan_try(struct sk_buff *skb)
2026 struct sock *sk = skb->sk;
2028 if (sk && !skb_shinfo(skb)->tx_flags) {
/* skb_tx_hash() won't be able to get sk.
 * We copy sk_hash into skb->rxhash
 */
2033 skb->rxhash = sk->sk_hash;
2038 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2040 return ((features & NETIF_F_GEN_CSUM) ||
2041 ((features & NETIF_F_V4_CSUM) &&
2042 protocol == htons(ETH_P_IP)) ||
2043 ((features & NETIF_F_V6_CSUM) &&
2044 protocol == htons(ETH_P_IPV6)) ||
2045 ((features & NETIF_F_FCOE_CRC) &&
2046 protocol == htons(ETH_P_FCOE)));
2049 static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
2051 if (!can_checksum_protocol(features, protocol)) {
2052 features &= ~NETIF_F_ALL_CSUM;
2053 features &= ~NETIF_F_SG;
2054 } else if (illegal_highdma(skb->dev, skb)) {
2055 features &= ~NETIF_F_SG;
2061 int netif_skb_features(struct sk_buff *skb)
2063 __be16 protocol = skb->protocol;
2064 int features = skb->dev->features;
2066 if (protocol == htons(ETH_P_8021Q)) {
2067 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2068 protocol = veh->h_vlan_encapsulated_proto;
2069 } else if (!vlan_tx_tag_present(skb)) {
2070 return harmonize_features(skb, protocol, features);
2073 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2075 if (protocol != htons(ETH_P_8021Q)) {
2076 return harmonize_features(skb, protocol, features);
2078 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2079 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2080 return harmonize_features(skb, protocol, features);
2083 EXPORT_SYMBOL(netif_skb_features);
2086 * Returns true if either:
2087 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2088 * 2. skb is fragmented and the device does not support SG, or if
2089 * at least one of fragments is in highmem and device does not
2090 * support DMA from it.
2092 static inline int skb_needs_linearize(struct sk_buff *skb,
2095 return skb_is_nonlinear(skb) &&
2096 ((skb_has_frag_list(skb) &&
2097 !(features & NETIF_F_FRAGLIST)) ||
2098 (skb_shinfo(skb)->nr_frags &&
2099 !(features & NETIF_F_SG)));
2102 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2103 struct netdev_queue *txq)
2105 const struct net_device_ops *ops = dev->netdev_ops;
2106 int rc = NETDEV_TX_OK;
2108 if (likely(!skb->next)) {
/*
 * If the device doesn't need skb->dst, release it right now while
 * it's hot in this CPU's cache.
 */
2115 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2118 if (!list_empty(&ptype_all))
2119 dev_queue_xmit_nit(skb, dev);
2121 skb_orphan_try(skb);
2123 features = netif_skb_features(skb);
2125 if (vlan_tx_tag_present(skb) &&
2126 !(features & NETIF_F_HW_VLAN_TX)) {
2127 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2134 if (netif_needs_gso(skb, features)) {
2135 if (unlikely(dev_gso_segment(skb, features)))
2140 if (skb_needs_linearize(skb, features) &&
2141 __skb_linearize(skb))
2144 /* If packet is not checksummed and device does not
2145 * support checksumming for this protocol, complete
2146 * checksumming here.
2148 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2149 skb_set_transport_header(skb,
2150 skb_checksum_start_offset(skb));
2151 if (!(features & NETIF_F_ALL_CSUM) &&
2152 skb_checksum_help(skb))
2157 rc = ops->ndo_start_xmit(skb, dev);
2158 trace_net_dev_xmit(skb, rc);
2159 if (rc == NETDEV_TX_OK)
2160 txq_trans_update(txq);
2166 struct sk_buff *nskb = skb->next;
2168 skb->next = nskb->next;
/*
 * If the device doesn't need nskb->dst, release it right now while
 * it's hot in this CPU's cache.
 */
2175 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2178 rc = ops->ndo_start_xmit(nskb, dev);
2179 trace_net_dev_xmit(nskb, rc);
2180 if (unlikely(rc != NETDEV_TX_OK)) {
2181 if (rc & ~NETDEV_TX_MASK)
2182 goto out_kfree_gso_skb;
2183 nskb->next = skb->next;
2187 txq_trans_update(txq);
2188 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2189 return NETDEV_TX_BUSY;
2190 } while (skb->next);
2193 if (likely(skb->next == NULL))
2194 skb->destructor = DEV_GSO_CB(skb)->destructor;
2201 static u32 hashrnd __read_mostly;
/*
 * Returns a Tx hash based on the given packet descriptor and the number
 * of Tx queues to be used as a distribution range.
 */
2207 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2208 unsigned int num_tx_queues)
2212 u16 qcount = num_tx_queues;
2214 if (skb_rx_queue_recorded(skb)) {
2215 hash = skb_get_rx_queue(skb);
2216 while (unlikely(hash >= num_tx_queues))
2217 hash -= num_tx_queues;
2222 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2223 qoffset = dev->tc_to_txq[tc].offset;
2224 qcount = dev->tc_to_txq[tc].count;
2227 if (skb->sk && skb->sk->sk_hash)
2228 hash = skb->sk->sk_hash;
2230 hash = (__force u16) skb->protocol ^ skb->rxhash;
2231 hash = jhash_1word(hash, hashrnd);
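/* Scale the 32-bit hash onto [qoffset, qoffset + qcount) without a modulo:
 * (hash * qcount) >> 32 maps the full u32 range evenly onto 0..qcount-1
 * (e.g. hash 0x80000000 with qcount 8 selects queue 4).
 */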
2233 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2235 EXPORT_SYMBOL(__skb_tx_hash);
2237 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2239 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2240 if (net_ratelimit()) {
2241 pr_warning("%s selects TX queue %d, but "
2242 "real number of TX queues is %d\n",
2243 dev->name, queue_index, dev->real_num_tx_queues);
2250 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2253 struct xps_dev_maps *dev_maps;
2254 struct xps_map *map;
2255 int queue_index = -1;
2258 dev_maps = rcu_dereference(dev->xps_maps);
2260 map = rcu_dereference(
2261 dev_maps->cpu_map[raw_smp_processor_id()]);
2264 queue_index = map->queues[0];
2267 if (skb->sk && skb->sk->sk_hash)
2268 hash = skb->sk->sk_hash;
2270 hash = (__force u16) skb->protocol ^
2272 hash = jhash_1word(hash, hashrnd);
2273 queue_index = map->queues[
2274 ((u64)hash * map->len) >> 32];
2276 if (unlikely(queue_index >= dev->real_num_tx_queues))
2288 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2289 struct sk_buff *skb)
2292 const struct net_device_ops *ops = dev->netdev_ops;
2294 if (dev->real_num_tx_queues == 1)
2296 else if (ops->ndo_select_queue) {
2297 queue_index = ops->ndo_select_queue(dev, skb);
2298 queue_index = dev_cap_txqueue(dev, queue_index);
2300 struct sock *sk = skb->sk;
2301 queue_index = sk_tx_queue_get(sk);
2303 if (queue_index < 0 || skb->ooo_okay ||
2304 queue_index >= dev->real_num_tx_queues) {
2305 int old_index = queue_index;
2307 queue_index = get_xps_queue(dev, skb);
2308 if (queue_index < 0)
2309 queue_index = skb_tx_hash(dev, skb);
2311 if (queue_index != old_index && sk) {
2312 struct dst_entry *dst =
2313 rcu_dereference_check(sk->sk_dst_cache, 1);
2315 if (dst && skb_dst(skb) == dst)
2316 sk_tx_queue_set(sk, queue_index);
2321 skb_set_queue_mapping(skb, queue_index);
2322 return netdev_get_tx_queue(dev, queue_index);
2325 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2326 struct net_device *dev,
2327 struct netdev_queue *txq)
2329 spinlock_t *root_lock = qdisc_lock(q);
2333 qdisc_skb_cb(skb)->pkt_len = skb->len;
2334 qdisc_calculate_pkt_len(skb, q);
2336 * Heuristic to force contended enqueues to serialize on a
2337 * separate lock before trying to get qdisc main lock.
2338 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2339 * and dequeue packets faster.
2341 contended = qdisc_is_running(q);
2342 if (unlikely(contended))
2343 spin_lock(&q->busylock);
2345 spin_lock(root_lock);
2346 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2349 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2350 qdisc_run_begin(q)) {
2352 * This is a work-conserving queue; there are no old skbs
2353 * waiting to be sent out; and the qdisc is not running -
2354 * xmit the skb directly.
2356 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2359 qdisc_bstats_update(q, skb);
2361 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2362 if (unlikely(contended)) {
2363 spin_unlock(&q->busylock);
2370 rc = NET_XMIT_SUCCESS;
2373 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2374 if (qdisc_run_begin(q)) {
2375 if (unlikely(contended)) {
2376 spin_unlock(&q->busylock);
2382 spin_unlock(root_lock);
2383 if (unlikely(contended))
2384 spin_unlock(&q->busylock);
2388 static DEFINE_PER_CPU(int, xmit_recursion);
2389 #define RECURSION_LIMIT 10
2392 * dev_queue_xmit - transmit a buffer
2393 * @skb: buffer to transmit
2395 * Queue a buffer for transmission to a network device. The caller must
2396 * have set the device and priority and built the buffer before calling
2397 * this function. The function can be called from an interrupt.
2399 * A negative errno code is returned on a failure. A success does not
2400 * guarantee the frame will be transmitted as it may be dropped due
2401 * to congestion or traffic shaping.
2403 * -----------------------------------------------------------------------------------
2404 * I notice this method can also return errors from the queue disciplines,
2405 * including NET_XMIT_DROP, which is a positive value. So, errors can also be positive.
2408 * Regardless of the return value, the skb is consumed, so it is currently
2409 * difficult to retry a send to this method. (You can bump the ref count
2410 * before sending to hold a reference for retry if you are careful.)
2412 * When calling this method, interrupts MUST be enabled. This is because
2413 * the BH enable code must have IRQs enabled so that it will not deadlock.
2416 int dev_queue_xmit(struct sk_buff *skb)
2418 struct net_device *dev = skb->dev;
2419 struct netdev_queue *txq;
2423 /* Disable soft irqs for various locks below. Also
2424 * stops preemption for RCU.
2428 txq = dev_pick_tx(dev, skb);
2429 q = rcu_dereference_bh(txq->qdisc);
2431 #ifdef CONFIG_NET_CLS_ACT
2432 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2434 trace_net_dev_queue(skb);
2436 rc = __dev_xmit_skb(skb, q, dev, txq);
2440 /* The device has no queue. Common case for software devices:
2441 loopback, all the sorts of tunnels...
2443 Really, it is unlikely that netif_tx_lock protection is necessary
2444 here. (e.g. loopback and IP tunnels are clean, ignoring statistics counters.)
2446 However, it is possible that they rely on protection made by us here.
2449 Check this and take the lock. It is not prone to deadlocks.
2450 Either way, shooting the noqueue qdisc is even simpler 8)
2452 if (dev->flags & IFF_UP) {
2453 int cpu = smp_processor_id(); /* ok because BHs are off */
2455 if (txq->xmit_lock_owner != cpu) {
2457 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2458 goto recursion_alert;
2460 HARD_TX_LOCK(dev, txq, cpu);
2462 if (!netif_tx_queue_stopped(txq)) {
2463 __this_cpu_inc(xmit_recursion);
2464 rc = dev_hard_start_xmit(skb, dev, txq);
2465 __this_cpu_dec(xmit_recursion);
2466 if (dev_xmit_complete(rc)) {
2467 HARD_TX_UNLOCK(dev, txq);
2471 HARD_TX_UNLOCK(dev, txq);
2472 if (net_ratelimit())
2473 printk(KERN_CRIT "Virtual device %s asks to "
2474 "queue packet!\n", dev->name);
2476 /* Recursion is detected! It is possible, unfortunately.
2480 if (net_ratelimit())
2481 printk(KERN_CRIT "Dead loop on virtual device "
2482 "%s, fix it urgently!\n", dev->name);
2487 rcu_read_unlock_bh();
2492 rcu_read_unlock_bh();
2495 EXPORT_SYMBOL(dev_queue_xmit);
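/*
 * Editor's note: a hedged sketch of how a caller might hand a fully built
 * skb to dev_queue_xmit(). The helper below is hypothetical and only
 * illustrates the contract described in the comment above: the device and
 * priority are set beforehand, the skb is consumed either way, and a
 * NET_XMIT_* or negative errno value comes back.
 */
#if 0
static int example_xmit_one(struct sk_buff *skb, struct net_device *dev)
{
	skb->dev = dev;		/* caller must pick the device ...          */
	skb->priority = 0;	/* ... and the priority before transmitting */

	/* skb is consumed whatever the outcome; do not touch it afterwards */
	return dev_queue_xmit(skb);
}
#endif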
2498 /*=======================================================================
2500 =======================================================================*/
2502 int netdev_max_backlog __read_mostly = 1000;
2503 int netdev_tstamp_prequeue __read_mostly = 1;
2504 int netdev_budget __read_mostly = 300;
2505 int weight_p __read_mostly = 64; /* old backlog weight */
2507 /* Called with irq disabled */
2508 static inline void ____napi_schedule(struct softnet_data *sd,
2509 struct napi_struct *napi)
2511 list_add_tail(&napi->poll_list, &sd->poll_list);
2512 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2516 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2517 * and src/dst port numbers. Returns a non-zero hash number on success and zero on failure.
2520 __u32 __skb_get_rxhash(struct sk_buff *skb)
2522 int nhoff, hash = 0, poff;
2523 struct ipv6hdr *ip6;
2526 u32 addr1, addr2, ihl;
2532 nhoff = skb_network_offset(skb);
2534 switch (skb->protocol) {
2535 case __constant_htons(ETH_P_IP):
2536 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2539 ip = (struct iphdr *) (skb->data + nhoff);
2540 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2543 ip_proto = ip->protocol;
2544 addr1 = (__force u32) ip->saddr;
2545 addr2 = (__force u32) ip->daddr;
2548 case __constant_htons(ETH_P_IPV6):
2549 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2552 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2553 ip_proto = ip6->nexthdr;
2554 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2555 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2563 poff = proto_ports_offset(ip_proto);
2565 nhoff += ihl * 4 + poff;
2566 if (pskb_may_pull(skb, nhoff + 4)) {
2567 ports.v32 = * (__force u32 *) (skb->data + nhoff);
2568 if (ports.v16[1] < ports.v16[0])
2569 swap(ports.v16[0], ports.v16[1]);
2573 /* get a consistent hash (same value on both flow directions) */
2577 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2584 EXPORT_SYMBOL(__skb_get_rxhash);
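/*
 * Editor's note: the swap above canonicalises the port pair so that both
 * directions of a flow hash to the same value (the address pair is handled
 * the same way before jhash_3words()). A stand-alone sketch of the idea,
 * with illustrative names only:
 */
#if 0
#include <stdint.h>

/* Order the pair so (a, b) and (b, a) produce identical hash input. */
static void example_canonicalise(uint32_t *a, uint32_t *b)
{
	if (*b < *a) {
		uint32_t tmp = *a;

		*a = *b;
		*b = tmp;
	}
}
#endif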
2588 /* One global table that all flow-based protocols share. */
2589 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2590 EXPORT_SYMBOL(rps_sock_flow_table);
2592 static struct rps_dev_flow *
2593 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2594 struct rps_dev_flow *rflow, u16 next_cpu)
2598 tcpu = rflow->cpu = next_cpu;
2599 if (tcpu != RPS_NO_CPU) {
2600 #ifdef CONFIG_RFS_ACCEL
2601 struct netdev_rx_queue *rxqueue;
2602 struct rps_dev_flow_table *flow_table;
2603 struct rps_dev_flow *old_rflow;
2608 /* Should we steer this flow to a different hardware queue? */
2609 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap)
2611 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2612 if (rxq_index == skb_get_rx_queue(skb))
2615 rxqueue = dev->_rx + rxq_index;
2616 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2619 flow_id = skb->rxhash & flow_table->mask;
2620 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2621 rxq_index, flow_id);
2625 rflow = &flow_table->flows[flow_id];
2626 rflow->cpu = next_cpu;
2628 if (old_rflow->filter == rflow->filter)
2629 old_rflow->filter = RPS_NO_FILTER;
2633 per_cpu(softnet_data, tcpu).input_queue_head;
2640 * get_rps_cpu is called from netif_receive_skb and returns the target
2641 * CPU from the RPS map of the receiving queue for a given skb.
2642 * rcu_read_lock must be held on entry.
2644 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2645 struct rps_dev_flow **rflowp)
2647 struct netdev_rx_queue *rxqueue;
2648 struct rps_map *map;
2649 struct rps_dev_flow_table *flow_table;
2650 struct rps_sock_flow_table *sock_flow_table;
2654 if (skb_rx_queue_recorded(skb)) {
2655 u16 index = skb_get_rx_queue(skb);
2656 if (unlikely(index >= dev->real_num_rx_queues)) {
2657 WARN_ONCE(dev->real_num_rx_queues > 1,
2658 "%s received packet on queue %u, but number "
2659 "of RX queues is %u\n",
2660 dev->name, index, dev->real_num_rx_queues);
2663 rxqueue = dev->_rx + index;
2667 map = rcu_dereference(rxqueue->rps_map);
2669 if (map->len == 1) {
2670 tcpu = map->cpus[0];
2671 if (cpu_online(tcpu))
2675 } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2679 skb_reset_network_header(skb);
2680 if (!skb_get_rxhash(skb))
2683 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2684 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2685 if (flow_table && sock_flow_table) {
2687 struct rps_dev_flow *rflow;
2689 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2692 next_cpu = sock_flow_table->ents[skb->rxhash &
2693 sock_flow_table->mask];
2696 * If the desired CPU (where last recvmsg was done) is
2697 * different from current CPU (one in the rx-queue flow
2698 * table entry), switch if one of the following holds:
2699 * - Current CPU is unset (equal to RPS_NO_CPU).
2700 * - Current CPU is offline.
2701 * - The current CPU's queue tail has advanced beyond the
2702 * last packet that was enqueued using this table entry.
2703 * This guarantees that all previous packets for the flow
2704 * have been dequeued, thus preserving in order delivery.
2706 if (unlikely(tcpu != next_cpu) &&
2707 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2708 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2709 rflow->last_qtail)) >= 0))
2710 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2712 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2720 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2722 if (cpu_online(tcpu)) {
2732 #ifdef CONFIG_RFS_ACCEL
2735 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2736 * @dev: Device on which the filter was set
2737 * @rxq_index: RX queue index
2738 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2739 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2741 * Drivers that implement ndo_rx_flow_steer() should periodically call
2742 * this function for each installed filter and remove the filters for
2743 * which it returns %true.
2745 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2746 u32 flow_id, u16 filter_id)
2748 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2749 struct rps_dev_flow_table *flow_table;
2750 struct rps_dev_flow *rflow;
2755 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2756 if (flow_table && flow_id <= flow_table->mask) {
2757 rflow = &flow_table->flows[flow_id];
2758 cpu = ACCESS_ONCE(rflow->cpu);
2759 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2760 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2761 rflow->last_qtail) <
2762 (int)(10 * flow_table->mask)))
2768 EXPORT_SYMBOL(rps_may_expire_flow);
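/*
 * Editor's note: a hedged sketch of the periodic expiry scan described in
 * the comment above. The driver structure, fields and removal helper are
 * hypothetical; only the rps_may_expire_flow() call itself is the real API.
 */
#if 0
static void example_expire_rfs_filters(struct example_nic *nic)
{
	unsigned int i;

	for (i = 0; i < nic->n_rfs_filters; i++) {
		struct example_rfs_filter *f = &nic->rfs_filters[i];

		if (!f->installed)
			continue;
		if (rps_may_expire_flow(nic->netdev, f->rxq_index,
					f->flow_id, f->filter_id))
			example_remove_hw_filter(nic, f);	/* hypothetical */
	}
}
#endif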
2770 #endif /* CONFIG_RFS_ACCEL */
2772 /* Called from hardirq (IPI) context */
2773 static void rps_trigger_softirq(void *data)
2775 struct softnet_data *sd = data;
2777 ____napi_schedule(sd, &sd->backlog);
2781 #endif /* CONFIG_RPS */
2784 * Check if this softnet_data structure belongs to another CPU.
2785 * If so, queue it to our IPI list and return 1.
2788 static int rps_ipi_queued(struct softnet_data *sd)
2791 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2794 sd->rps_ipi_next = mysd->rps_ipi_list;
2795 mysd->rps_ipi_list = sd;
2797 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2800 #endif /* CONFIG_RPS */
2805 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2806 * queue (may be a remote CPU queue).
2808 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2809 unsigned int *qtail)
2811 struct softnet_data *sd;
2812 unsigned long flags;
2814 sd = &per_cpu(softnet_data, cpu);
2816 local_irq_save(flags);
2819 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2820 if (skb_queue_len(&sd->input_pkt_queue)) {
2822 __skb_queue_tail(&sd->input_pkt_queue, skb);
2823 input_queue_tail_incr_save(sd, qtail);
2825 local_irq_restore(flags);
2826 return NET_RX_SUCCESS;
2829 /* Schedule NAPI for the backlog device.
2830 * We can use a non-atomic operation since we own the queue lock.
2832 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2833 if (!rps_ipi_queued(sd))
2834 ____napi_schedule(sd, &sd->backlog);
2842 local_irq_restore(flags);
2844 atomic_long_inc(&skb->dev->rx_dropped);
2850 * netif_rx - post buffer to the network code
2851 * @skb: buffer to post
2853 * This function receives a packet from a device driver and queues it for
2854 * the upper (protocol) levels to process. It always succeeds. The buffer
2855 * may be dropped during processing for congestion control or by the protocol layers.
2859 * NET_RX_SUCCESS (no congestion)
2860 * NET_RX_DROP (packet was dropped)
2864 int netif_rx(struct sk_buff *skb)
2868 /* if netpoll wants it, pretend we never saw it */
2869 if (netpoll_rx(skb))
2872 if (netdev_tstamp_prequeue)
2873 net_timestamp_check(skb);
2875 trace_netif_rx(skb);
2878 struct rps_dev_flow voidflow, *rflow = &voidflow;
2884 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2886 cpu = smp_processor_id();
2888 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2896 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2902 EXPORT_SYMBOL(netif_rx);
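/*
 * Editor's note: a hedged sketch of the classic (non-NAPI) receive path
 * that netif_rx() serves. The copy from the hardware is hypothetical; the
 * allocate / fill / eth_type_trans() / netif_rx() sequence mirrors what
 * typical drivers do.
 */
#if 0
static void example_legacy_rx(struct net_device *dev, const void *data,
			      unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

	if (!skb)
		return;				/* drop on allocation failure */

	memcpy(skb_put(skb, len), data, len);	/* hypothetical copy from NIC */
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);				/* queue for the upper layers */
}
#endif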
2904 int netif_rx_ni(struct sk_buff *skb)
2909 err = netif_rx(skb);
2910 if (local_softirq_pending())
2916 EXPORT_SYMBOL(netif_rx_ni);
2918 static void net_tx_action(struct softirq_action *h)
2920 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2922 if (sd->completion_queue) {
2923 struct sk_buff *clist;
2925 local_irq_disable();
2926 clist = sd->completion_queue;
2927 sd->completion_queue = NULL;
2931 struct sk_buff *skb = clist;
2932 clist = clist->next;
2934 WARN_ON(atomic_read(&skb->users));
2935 trace_kfree_skb(skb, net_tx_action);
2940 if (sd->output_queue) {
2943 local_irq_disable();
2944 head = sd->output_queue;
2945 sd->output_queue = NULL;
2946 sd->output_queue_tailp = &sd->output_queue;
2950 struct Qdisc *q = head;
2951 spinlock_t *root_lock;
2953 head = head->next_sched;
2955 root_lock = qdisc_lock(q);
2956 if (spin_trylock(root_lock)) {
2957 smp_mb__before_clear_bit();
2958 clear_bit(__QDISC_STATE_SCHED,
2961 spin_unlock(root_lock);
2963 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2965 __netif_reschedule(q);
2967 smp_mb__before_clear_bit();
2968 clear_bit(__QDISC_STATE_SCHED,
2976 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2977 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2978 /* This hook is defined here for ATM LANE */
2979 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2980 unsigned char *addr) __read_mostly;
2981 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2984 #ifdef CONFIG_NET_CLS_ACT
2985 /* TODO: Maybe we should just force sch_ingress to be compiled in
2986 * when CONFIG_NET_CLS_ACT is? Otherwise we execute some useless
2987 * instructions (a compare and two extra stores) when it is not enabled
2988 * but CONFIG_NET_CLS_ACT is.
2989 * NOTE: This doesn't stop any functionality; if you don't have
2990 * the ingress scheduler, you just can't add policies on ingress.
2993 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2995 struct net_device *dev = skb->dev;
2996 u32 ttl = G_TC_RTTL(skb->tc_verd);
2997 int result = TC_ACT_OK;
3000 if (unlikely(MAX_RED_LOOP < ttl++)) {
3001 if (net_ratelimit())
3002 pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
3003 skb->skb_iif, dev->ifindex);
3007 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3008 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3011 if (q != &noop_qdisc) {
3012 spin_lock(qdisc_lock(q));
3013 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3014 result = qdisc_enqueue_root(skb, q);
3015 spin_unlock(qdisc_lock(q));
3021 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3022 struct packet_type **pt_prev,
3023 int *ret, struct net_device *orig_dev)
3025 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3027 if (!rxq || rxq->qdisc == &noop_qdisc)
3031 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3035 switch (ing_filter(skb, rxq)) {
3049 * netdev_rx_handler_register - register receive handler
3050 * @dev: device to register a handler for
3051 * @rx_handler: receive handler to register
3052 * @rx_handler_data: data pointer that is used by rx handler
3054 * Register a receive handler for a device. This handler will then be
3055 * called from __netif_receive_skb. A negative errno code is returned on a failure.
3058 * The caller must hold the rtnl_mutex.
3060 int netdev_rx_handler_register(struct net_device *dev,
3061 rx_handler_func_t *rx_handler,
3062 void *rx_handler_data)
3066 if (dev->rx_handler)
3069 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3070 rcu_assign_pointer(dev->rx_handler, rx_handler);
3074 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
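/*
 * Editor's note: a hedged sketch of registering a receive handler, as a
 * bridge- or macvlan-like user would. The handler body and private data
 * are hypothetical; rtnl serialisation follows the rule stated in the
 * comment above. In this kernel the handler takes and returns an skb, as
 * the rx_handler(skb) call in __netif_receive_skb() below shows.
 */
#if 0
static struct sk_buff *example_rx_handler(struct sk_buff *skb)
{
	/* inspect or steal the skb here; return it to resume normal RX */
	return skb;
}

static int example_attach(struct net_device *dev, void *priv)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, example_rx_handler, priv);
	rtnl_unlock();
	return err;
}
#endif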
3077 * netdev_rx_handler_unregister - unregister receive handler
3078 * @dev: device to unregister a handler from
3080 * Unregister a receive handler from a device.
3082 * The caller must hold the rtnl_mutex.
3084 void netdev_rx_handler_unregister(struct net_device *dev)
3088 rcu_assign_pointer(dev->rx_handler, NULL);
3089 rcu_assign_pointer(dev->rx_handler_data, NULL);
3091 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3093 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
3094 struct net_device *master)
3096 if (skb->pkt_type == PACKET_HOST) {
3097 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
3099 memcpy(dest, master->dev_addr, ETH_ALEN);
3103 /* On bonding slaves other than the currently active slave, suppress
3104 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
3105 * ARP on active-backup slaves with arp_validate enabled.
3107 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
3109 struct net_device *dev = skb->dev;
3111 if (master->priv_flags & IFF_MASTER_ARPMON)
3112 dev->last_rx = jiffies;
3114 if ((master->priv_flags & IFF_MASTER_ALB) &&
3115 (master->priv_flags & IFF_BRIDGE_PORT)) {
3116 /* Do address unmangle. The local destination address
3117 * will be always the one master has. Provides the right
3118 * functionality in a bridge.
3120 skb_bond_set_mac_by_master(skb, master);
3123 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
3124 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
3125 skb->protocol == __cpu_to_be16(ETH_P_ARP))
3128 if (master->priv_flags & IFF_MASTER_ALB) {
3129 if (skb->pkt_type != PACKET_BROADCAST &&
3130 skb->pkt_type != PACKET_MULTICAST)
3133 if (master->priv_flags & IFF_MASTER_8023AD &&
3134 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
3141 EXPORT_SYMBOL(__skb_bond_should_drop);
3143 static int __netif_receive_skb(struct sk_buff *skb)
3145 struct packet_type *ptype, *pt_prev;
3146 rx_handler_func_t *rx_handler;
3147 struct net_device *orig_dev;
3148 struct net_device *master;
3149 struct net_device *null_or_orig;
3150 struct net_device *orig_or_bond;
3151 int ret = NET_RX_DROP;
3154 if (!netdev_tstamp_prequeue)
3155 net_timestamp_check(skb);
3157 trace_netif_receive_skb(skb);
3159 /* if we've gotten here through NAPI, check netpoll */
3160 if (netpoll_receive_skb(skb))
3164 skb->skb_iif = skb->dev->ifindex;
3167 * bonding note: skbs received on inactive slaves should only
3168 * be delivered to pkt handlers that are exact matches. Also
3169 * the deliver_no_wcard flag will be set. If packet handlers
3170 * are sensitive to duplicate packets these skbs will need to
3171 * be dropped at the handler.
3173 null_or_orig = NULL;
3174 orig_dev = skb->dev;
3175 master = ACCESS_ONCE(orig_dev->master);
3176 if (skb->deliver_no_wcard)
3177 null_or_orig = orig_dev;
3179 if (skb_bond_should_drop(skb, master)) {
3180 skb->deliver_no_wcard = 1;
3181 null_or_orig = orig_dev; /* deliver only exact match */
3186 __this_cpu_inc(softnet_data.processed);
3187 skb_reset_network_header(skb);
3188 skb_reset_transport_header(skb);
3189 skb->mac_len = skb->network_header - skb->mac_header;
3195 #ifdef CONFIG_NET_CLS_ACT
3196 if (skb->tc_verd & TC_NCLS) {
3197 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3202 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3203 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3204 ptype->dev == orig_dev) {
3206 ret = deliver_skb(skb, pt_prev, orig_dev);
3211 #ifdef CONFIG_NET_CLS_ACT
3212 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3218 /* Handle special case of bridge or macvlan */
3219 rx_handler = rcu_dereference(skb->dev->rx_handler);
3222 ret = deliver_skb(skb, pt_prev, orig_dev);
3225 skb = rx_handler(skb);
3230 if (vlan_tx_tag_present(skb)) {
3232 ret = deliver_skb(skb, pt_prev, orig_dev);
3235 if (vlan_hwaccel_do_receive(&skb)) {
3236 ret = __netif_receive_skb(skb);
3238 } else if (unlikely(!skb))
3243 * Make sure frames received on VLAN interfaces stacked on
3244 * bonding interfaces still make their way to any base bonding
3245 * device that may have registered for a specific ptype. The
3246 * handler may have to adjust skb->dev and orig_dev.
3248 orig_or_bond = orig_dev;
3249 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3250 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
3251 orig_or_bond = vlan_dev_real_dev(skb->dev);
3254 type = skb->protocol;
3255 list_for_each_entry_rcu(ptype,
3256 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3257 if (ptype->type == type && (ptype->dev == null_or_orig ||
3258 ptype->dev == skb->dev || ptype->dev == orig_dev ||
3259 ptype->dev == orig_or_bond)) {
3261 ret = deliver_skb(skb, pt_prev, orig_dev);
3267 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3269 atomic_long_inc(&skb->dev->rx_dropped);
3271 /* Jamal, now you will not be able to escape explaining
3272 * to me how you were going to use this. :-)
3283 * netif_receive_skb - process receive buffer from network
3284 * @skb: buffer to process
3286 * netif_receive_skb() is the main receive data processing function.
3287 * It always succeeds. The buffer may be dropped during processing
3288 * for congestion control or by the protocol layers.
3290 * This function may only be called from softirq context and interrupts
3291 * should be enabled.
3293 * Return values (usually ignored):
3294 * NET_RX_SUCCESS: no congestion
3295 * NET_RX_DROP: packet was dropped
3297 int netif_receive_skb(struct sk_buff *skb)
3299 if (netdev_tstamp_prequeue)
3300 net_timestamp_check(skb);
3302 if (skb_defer_rx_timestamp(skb))
3303 return NET_RX_SUCCESS;
3307 struct rps_dev_flow voidflow, *rflow = &voidflow;
3312 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3315 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3319 ret = __netif_receive_skb(skb);
3325 return __netif_receive_skb(skb);
3328 EXPORT_SYMBOL(netif_receive_skb);
3330 /* Network device is going away, flush any packets still pending
3331 * Called with irqs disabled.
3333 static void flush_backlog(void *arg)
3335 struct net_device *dev = arg;
3336 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3337 struct sk_buff *skb, *tmp;
3340 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3341 if (skb->dev == dev) {
3342 __skb_unlink(skb, &sd->input_pkt_queue);
3344 input_queue_head_incr(sd);
3349 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3350 if (skb->dev == dev) {
3351 __skb_unlink(skb, &sd->process_queue);
3353 input_queue_head_incr(sd);
3358 static int napi_gro_complete(struct sk_buff *skb)
3360 struct packet_type *ptype;
3361 __be16 type = skb->protocol;
3362 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3365 if (NAPI_GRO_CB(skb)->count == 1) {
3366 skb_shinfo(skb)->gso_size = 0;
3371 list_for_each_entry_rcu(ptype, head, list) {
3372 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3375 err = ptype->gro_complete(skb);
3381 WARN_ON(&ptype->list == head);
3383 return NET_RX_SUCCESS;
3387 return netif_receive_skb(skb);
3390 inline void napi_gro_flush(struct napi_struct *napi)
3392 struct sk_buff *skb, *next;
3394 for (skb = napi->gro_list; skb; skb = next) {
3397 napi_gro_complete(skb);
3400 napi->gro_count = 0;
3401 napi->gro_list = NULL;
3403 EXPORT_SYMBOL(napi_gro_flush);
3405 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3407 struct sk_buff **pp = NULL;
3408 struct packet_type *ptype;
3409 __be16 type = skb->protocol;
3410 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3413 enum gro_result ret;
3415 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3418 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3422 list_for_each_entry_rcu(ptype, head, list) {
3423 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3426 skb_set_network_header(skb, skb_gro_offset(skb));
3427 mac_len = skb->network_header - skb->mac_header;
3428 skb->mac_len = mac_len;
3429 NAPI_GRO_CB(skb)->same_flow = 0;
3430 NAPI_GRO_CB(skb)->flush = 0;
3431 NAPI_GRO_CB(skb)->free = 0;
3433 pp = ptype->gro_receive(&napi->gro_list, skb);
3438 if (&ptype->list == head)
3441 same_flow = NAPI_GRO_CB(skb)->same_flow;
3442 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3445 struct sk_buff *nskb = *pp;
3449 napi_gro_complete(nskb);
3456 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3460 NAPI_GRO_CB(skb)->count = 1;
3461 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3462 skb->next = napi->gro_list;
3463 napi->gro_list = skb;
3467 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3468 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3470 BUG_ON(skb->end - skb->tail < grow);
3472 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3475 skb->data_len -= grow;
3477 skb_shinfo(skb)->frags[0].page_offset += grow;
3478 skb_shinfo(skb)->frags[0].size -= grow;
3480 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3481 put_page(skb_shinfo(skb)->frags[0].page);
3482 memmove(skb_shinfo(skb)->frags,
3483 skb_shinfo(skb)->frags + 1,
3484 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3495 EXPORT_SYMBOL(dev_gro_receive);
3497 static inline gro_result_t
3498 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3502 for (p = napi->gro_list; p; p = p->next) {
3503 unsigned long diffs;
3505 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3506 diffs |= p->vlan_tci ^ skb->vlan_tci;
3507 diffs |= compare_ether_header(skb_mac_header(p),
3508 skb_gro_mac_header(skb));
3509 NAPI_GRO_CB(p)->same_flow = !diffs;
3510 NAPI_GRO_CB(p)->flush = 0;
3513 return dev_gro_receive(napi, skb);
3516 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3520 if (netif_receive_skb(skb))
3525 case GRO_MERGED_FREE:
3536 EXPORT_SYMBOL(napi_skb_finish);
3538 void skb_gro_reset_offset(struct sk_buff *skb)
3540 NAPI_GRO_CB(skb)->data_offset = 0;
3541 NAPI_GRO_CB(skb)->frag0 = NULL;
3542 NAPI_GRO_CB(skb)->frag0_len = 0;
3544 if (skb->mac_header == skb->tail &&
3545 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3546 NAPI_GRO_CB(skb)->frag0 =
3547 page_address(skb_shinfo(skb)->frags[0].page) +
3548 skb_shinfo(skb)->frags[0].page_offset;
3549 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3552 EXPORT_SYMBOL(skb_gro_reset_offset);
3554 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3556 skb_gro_reset_offset(skb);
3558 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3560 EXPORT_SYMBOL(napi_gro_receive);
3562 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3564 __skb_pull(skb, skb_headlen(skb));
3565 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3571 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3573 struct sk_buff *skb = napi->skb;
3576 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3582 EXPORT_SYMBOL(napi_get_frags);
3584 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3590 skb->protocol = eth_type_trans(skb, skb->dev);
3592 if (ret == GRO_HELD)
3593 skb_gro_pull(skb, -ETH_HLEN);
3594 else if (netif_receive_skb(skb))
3599 case GRO_MERGED_FREE:
3600 napi_reuse_skb(napi, skb);
3609 EXPORT_SYMBOL(napi_frags_finish);
3611 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3613 struct sk_buff *skb = napi->skb;
3620 skb_reset_mac_header(skb);
3621 skb_gro_reset_offset(skb);
3623 off = skb_gro_offset(skb);
3624 hlen = off + sizeof(*eth);
3625 eth = skb_gro_header_fast(skb, off);
3626 if (skb_gro_header_hard(skb, hlen)) {
3627 eth = skb_gro_header_slow(skb, hlen, off);
3628 if (unlikely(!eth)) {
3629 napi_reuse_skb(napi, skb);
3635 skb_gro_pull(skb, sizeof(*eth));
3638 * This works because the only protocols we care about don't require
3639 * special handling. We'll fix it up properly at the end.
3641 skb->protocol = eth->h_proto;
3646 EXPORT_SYMBOL(napi_frags_skb);
3648 gro_result_t napi_gro_frags(struct napi_struct *napi)
3650 struct sk_buff *skb = napi_frags_skb(napi);
3655 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3657 EXPORT_SYMBOL(napi_gro_frags);
3660 * net_rps_action sends any pending IPIs for RPS.
3661 * Note: called with local irq disabled, but exits with local irq enabled.
3663 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3666 struct softnet_data *remsd = sd->rps_ipi_list;
3669 sd->rps_ipi_list = NULL;
3673 /* Send pending IPI's to kick RPS processing on remote cpus. */
3675 struct softnet_data *next = remsd->rps_ipi_next;
3677 if (cpu_online(remsd->cpu))
3678 __smp_call_function_single(remsd->cpu,
3687 static int process_backlog(struct napi_struct *napi, int quota)
3690 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3693 /* Check if we have pending IPIs; it's better to send them now
3694 * rather than waiting for net_rx_action() to end.
3696 if (sd->rps_ipi_list) {
3697 local_irq_disable();
3698 net_rps_action_and_irq_enable(sd);
3701 napi->weight = weight_p;
3702 local_irq_disable();
3703 while (work < quota) {
3704 struct sk_buff *skb;
3707 while ((skb = __skb_dequeue(&sd->process_queue))) {
3709 __netif_receive_skb(skb);
3710 local_irq_disable();
3711 input_queue_head_incr(sd);
3712 if (++work >= quota) {
3719 qlen = skb_queue_len(&sd->input_pkt_queue);
3721 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3722 &sd->process_queue);
3724 if (qlen < quota - work) {
3726 * Inline a custom version of __napi_complete().
3727 * Only the current CPU owns and manipulates this napi,
3728 * and NAPI_STATE_SCHED is the only possible flag set on the backlog,
3729 * so we can use a plain write instead of clear_bit(),
3730 * and we don't need an smp_mb() memory barrier.
3732 list_del(&napi->poll_list);
3735 quota = work + qlen;
3745 * __napi_schedule - schedule for receive
3746 * @n: entry to schedule
3748 * The entry's receive function will be scheduled to run
3750 void __napi_schedule(struct napi_struct *n)
3752 unsigned long flags;
3754 local_irq_save(flags);
3755 ____napi_schedule(&__get_cpu_var(softnet_data), n);
3756 local_irq_restore(flags);
3758 EXPORT_SYMBOL(__napi_schedule);
3760 void __napi_complete(struct napi_struct *n)
3762 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3763 BUG_ON(n->gro_list);
3765 list_del(&n->poll_list);
3766 smp_mb__before_clear_bit();
3767 clear_bit(NAPI_STATE_SCHED, &n->state);
3769 EXPORT_SYMBOL(__napi_complete);
3771 void napi_complete(struct napi_struct *n)
3773 unsigned long flags;
3776 * don't let napi dequeue from the CPU poll list
3777 * just in case it's running on a different CPU.
3779 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3783 local_irq_save(flags);
3785 local_irq_restore(flags);
3787 EXPORT_SYMBOL(napi_complete);
3789 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3790 int (*poll)(struct napi_struct *, int), int weight)
3792 INIT_LIST_HEAD(&napi->poll_list);
3793 napi->gro_count = 0;
3794 napi->gro_list = NULL;
3797 napi->weight = weight;
3798 list_add(&napi->dev_list, &dev->napi_list);
3800 #ifdef CONFIG_NETPOLL
3801 spin_lock_init(&napi->poll_lock);
3802 napi->poll_owner = -1;
3804 set_bit(NAPI_STATE_SCHED, &napi->state);
3806 EXPORT_SYMBOL(netif_napi_add);
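/*
 * Editor's note: a hedged sketch of the driver side of the NAPI contract
 * that netif_napi_add() sets up. The rx fetch helper is hypothetical; the
 * rule of calling napi_complete() only when less than the full budget was
 * consumed matches the ownership logic in net_rx_action() below.
 */
#if 0
static struct sk_buff *example_fetch_rx_skb(struct napi_struct *napi);	/* hypothetical */

static int example_napi_poll(struct napi_struct *napi, int budget)
{
	int work = 0;

	while (work < budget) {
		struct sk_buff *skb = example_fetch_rx_skb(napi);

		if (!skb)
			break;
		napi_gro_receive(napi, skb);
		work++;
	}

	if (work < budget)
		napi_complete(napi);	/* re-enable device interrupts here */

	return work;
}

/* Typically wired up once at probe time:
 *	netif_napi_add(dev, &priv->napi, example_napi_poll, 64);
 */
#endif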
3808 void netif_napi_del(struct napi_struct *napi)
3810 struct sk_buff *skb, *next;
3812 list_del_init(&napi->dev_list);
3813 napi_free_frags(napi);
3815 for (skb = napi->gro_list; skb; skb = next) {
3821 napi->gro_list = NULL;
3822 napi->gro_count = 0;
3824 EXPORT_SYMBOL(netif_napi_del);
3826 static void net_rx_action(struct softirq_action *h)
3828 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3829 unsigned long time_limit = jiffies + 2;
3830 int budget = netdev_budget;
3833 local_irq_disable();
3835 while (!list_empty(&sd->poll_list)) {
3836 struct napi_struct *n;
3839 /* If the softirq window is exhausted then punt.
3840 * Allow this to run for 2 jiffies, which allows
3841 * an average latency of 1.5/HZ.
3843 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3848 /* Even though interrupts have been re-enabled, this
3849 * access is safe because interrupts can only add new
3850 * entries to the tail of this list, and only ->poll()
3851 * calls can remove this head entry from the list.
3853 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3855 have = netpoll_poll_lock(n);
3859 /* This NAPI_STATE_SCHED test is for avoiding a race
3860 * with netpoll's poll_napi(). Only the entity which
3861 * obtains the lock and sees NAPI_STATE_SCHED set will
3862 * actually make the ->poll() call. Therefore we avoid
3863 * accidentally calling ->poll() when NAPI is not scheduled.
3866 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3867 work = n->poll(n, weight);
3871 WARN_ON_ONCE(work > weight);
3875 local_irq_disable();
3877 /* Drivers must not modify the NAPI state if they
3878 * consume the entire weight. In such cases this code
3879 * still "owns" the NAPI instance and therefore can
3880 * move the instance around on the list at-will.
3882 if (unlikely(work == weight)) {
3883 if (unlikely(napi_disable_pending(n))) {
3886 local_irq_disable();
3888 list_move_tail(&n->poll_list, &sd->poll_list);
3891 netpoll_poll_unlock(have);
3894 net_rps_action_and_irq_enable(sd);
3896 #ifdef CONFIG_NET_DMA
3898 * There may not be any more sk_buffs coming right now, so push
3899 * any pending DMA copies to hardware
3901 dma_issue_pending_all();
3908 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3912 static gifconf_func_t *gifconf_list[NPROTO];
3915 * register_gifconf - register a SIOCGIF handler
3916 * @family: Address family
3917 * @gifconf: Function handler
3919 * Register protocol dependent address dumping routines. The handler
3920 * that is passed must not be freed or reused until it has been replaced
3921 * by another handler.
3923 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3925 if (family >= NPROTO)
3927 gifconf_list[family] = gifconf;
3930 EXPORT_SYMBOL(register_gifconf);
3934 * Map an interface index to its name (SIOCGIFNAME)
3938 * We need this ioctl for efficient implementation of the
3939 * if_indextoname() function required by the IPv6 API. Without
3940 * it, we would have to search all the interfaces to find a match.
3944 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3946 struct net_device *dev;
3950 * Fetch the caller's info block.
3953 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3957 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3963 strcpy(ifr.ifr_name, dev->name);
3966 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3972 * Perform a SIOCGIFCONF call. This structure will change
3973 * size eventually, and there is nothing I can do about it.
3974 * Thus we will need a 'compatibility mode'.
3977 static int dev_ifconf(struct net *net, char __user *arg)
3980 struct net_device *dev;
3987 * Fetch the caller's info block.
3990 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3997 * Loop over the interfaces, and write an info block for each.
4001 for_each_netdev(net, dev) {
4002 for (i = 0; i < NPROTO; i++) {
4003 if (gifconf_list[i]) {
4006 done = gifconf_list[i](dev, NULL, 0);
4008 done = gifconf_list[i](dev, pos + total,
4018 * All done. Write the updated control block back to the caller.
4020 ifc.ifc_len = total;
4023 * Both BSD and Solaris return 0 here, so we do too.
4025 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4028 #ifdef CONFIG_PROC_FS
4030 * This is invoked by the /proc filesystem handler to display a device in detail.
4033 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4036 struct net *net = seq_file_net(seq);
4038 struct net_device *dev;
4042 return SEQ_START_TOKEN;
4045 for_each_netdev_rcu(net, dev)
4052 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4054 struct net_device *dev = (v == SEQ_START_TOKEN) ?
4055 first_net_device(seq_file_net(seq)) :
4056 next_net_device((struct net_device *)v);
4059 return rcu_dereference(dev);
4062 void dev_seq_stop(struct seq_file *seq, void *v)
4068 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4070 struct rtnl_link_stats64 temp;
4071 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4073 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4074 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4075 dev->name, stats->rx_bytes, stats->rx_packets,
4077 stats->rx_dropped + stats->rx_missed_errors,
4078 stats->rx_fifo_errors,
4079 stats->rx_length_errors + stats->rx_over_errors +
4080 stats->rx_crc_errors + stats->rx_frame_errors,
4081 stats->rx_compressed, stats->multicast,
4082 stats->tx_bytes, stats->tx_packets,
4083 stats->tx_errors, stats->tx_dropped,
4084 stats->tx_fifo_errors, stats->collisions,
4085 stats->tx_carrier_errors +
4086 stats->tx_aborted_errors +
4087 stats->tx_window_errors +
4088 stats->tx_heartbeat_errors,
4089 stats->tx_compressed);
4093 * Called from the PROCfs module. This now uses the new arbitrary sized
4094 * /proc/net interface to create /proc/net/dev
4096 static int dev_seq_show(struct seq_file *seq, void *v)
4098 if (v == SEQ_START_TOKEN)
4099 seq_puts(seq, "Inter-| Receive "
4101 " face |bytes packets errs drop fifo frame "
4102 "compressed multicast|bytes packets errs "
4103 "drop fifo colls carrier compressed\n");
4105 dev_seq_printf_stats(seq, v);
4109 static struct softnet_data *softnet_get_online(loff_t *pos)
4111 struct softnet_data *sd = NULL;
4113 while (*pos < nr_cpu_ids)
4114 if (cpu_online(*pos)) {
4115 sd = &per_cpu(softnet_data, *pos);
4122 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4124 return softnet_get_online(pos);
4127 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4130 return softnet_get_online(pos);
4133 static void softnet_seq_stop(struct seq_file *seq, void *v)
4137 static int softnet_seq_show(struct seq_file *seq, void *v)
4139 struct softnet_data *sd = v;
4141 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4142 sd->processed, sd->dropped, sd->time_squeeze, 0,
4143 0, 0, 0, 0, /* was fastroute */
4144 sd->cpu_collision, sd->received_rps);
4148 static const struct seq_operations dev_seq_ops = {
4149 .start = dev_seq_start,
4150 .next = dev_seq_next,
4151 .stop = dev_seq_stop,
4152 .show = dev_seq_show,
4155 static int dev_seq_open(struct inode *inode, struct file *file)
4157 return seq_open_net(inode, file, &dev_seq_ops,
4158 sizeof(struct seq_net_private));
4161 static const struct file_operations dev_seq_fops = {
4162 .owner = THIS_MODULE,
4163 .open = dev_seq_open,
4165 .llseek = seq_lseek,
4166 .release = seq_release_net,
4169 static const struct seq_operations softnet_seq_ops = {
4170 .start = softnet_seq_start,
4171 .next = softnet_seq_next,
4172 .stop = softnet_seq_stop,
4173 .show = softnet_seq_show,
4176 static int softnet_seq_open(struct inode *inode, struct file *file)
4178 return seq_open(file, &softnet_seq_ops);
4181 static const struct file_operations softnet_seq_fops = {
4182 .owner = THIS_MODULE,
4183 .open = softnet_seq_open,
4185 .llseek = seq_lseek,
4186 .release = seq_release,
4189 static void *ptype_get_idx(loff_t pos)
4191 struct packet_type *pt = NULL;
4195 list_for_each_entry_rcu(pt, &ptype_all, list) {
4201 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4202 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4211 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4215 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4218 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4220 struct packet_type *pt;
4221 struct list_head *nxt;
4225 if (v == SEQ_START_TOKEN)
4226 return ptype_get_idx(0);
4229 nxt = pt->list.next;
4230 if (pt->type == htons(ETH_P_ALL)) {
4231 if (nxt != &ptype_all)
4234 nxt = ptype_base[0].next;
4236 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4238 while (nxt == &ptype_base[hash]) {
4239 if (++hash >= PTYPE_HASH_SIZE)
4241 nxt = ptype_base[hash].next;
4244 return list_entry(nxt, struct packet_type, list);
4247 static void ptype_seq_stop(struct seq_file *seq, void *v)
4253 static int ptype_seq_show(struct seq_file *seq, void *v)
4255 struct packet_type *pt = v;
4257 if (v == SEQ_START_TOKEN)
4258 seq_puts(seq, "Type Device Function\n");
4259 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4260 if (pt->type == htons(ETH_P_ALL))
4261 seq_puts(seq, "ALL ");
4263 seq_printf(seq, "%04x", ntohs(pt->type));
4265 seq_printf(seq, " %-8s %pF\n",
4266 pt->dev ? pt->dev->name : "", pt->func);
4272 static const struct seq_operations ptype_seq_ops = {
4273 .start = ptype_seq_start,
4274 .next = ptype_seq_next,
4275 .stop = ptype_seq_stop,
4276 .show = ptype_seq_show,
4279 static int ptype_seq_open(struct inode *inode, struct file *file)
4281 return seq_open_net(inode, file, &ptype_seq_ops,
4282 sizeof(struct seq_net_private));
4285 static const struct file_operations ptype_seq_fops = {
4286 .owner = THIS_MODULE,
4287 .open = ptype_seq_open,
4289 .llseek = seq_lseek,
4290 .release = seq_release_net,
4294 static int __net_init dev_proc_net_init(struct net *net)
4298 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4300 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4302 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4305 if (wext_proc_init(net))
4311 proc_net_remove(net, "ptype");
4313 proc_net_remove(net, "softnet_stat");
4315 proc_net_remove(net, "dev");
4319 static void __net_exit dev_proc_net_exit(struct net *net)
4321 wext_proc_exit(net);
4323 proc_net_remove(net, "ptype");
4324 proc_net_remove(net, "softnet_stat");
4325 proc_net_remove(net, "dev");
4328 static struct pernet_operations __net_initdata dev_proc_ops = {
4329 .init = dev_proc_net_init,
4330 .exit = dev_proc_net_exit,
4333 static int __init dev_proc_init(void)
4335 return register_pernet_subsys(&dev_proc_ops);
4338 #define dev_proc_init() 0
4339 #endif /* CONFIG_PROC_FS */
4343 * netdev_set_master - set up master/slave pair
4344 * @slave: slave device
4345 * @master: new master device
4347 * Changes the master device of the slave. Pass %NULL to break the
4348 * bonding. The caller must hold the RTNL semaphore. On a failure
4349 * a negative errno code is returned. On success the reference counts
4350 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4351 * function returns zero.
4353 int netdev_set_master(struct net_device *slave, struct net_device *master)
4355 struct net_device *old = slave->master;
4365 slave->master = master;
4372 slave->flags |= IFF_SLAVE;
4374 slave->flags &= ~IFF_SLAVE;
4376 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4379 EXPORT_SYMBOL(netdev_set_master);
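/*
 * Editor's note: a hedged sketch of the enslave/release pairing that the
 * comment above describes, in the style of a bonding-like master driver.
 * The wrappers are hypothetical; RTNL is assumed to be held by the caller,
 * as the comment requires.
 */
#if 0
static int example_enslave(struct net_device *master, struct net_device *slave)
{
	return netdev_set_master(slave, master);	/* pair with ...       */
}

static void example_release(struct net_device *slave)
{
	netdev_set_master(slave, NULL);			/* ... breaking the bond */
}
#endif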
4381 static void dev_change_rx_flags(struct net_device *dev, int flags)
4383 const struct net_device_ops *ops = dev->netdev_ops;
4385 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4386 ops->ndo_change_rx_flags(dev, flags);
4389 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4391 unsigned short old_flags = dev->flags;
4397 dev->flags |= IFF_PROMISC;
4398 dev->promiscuity += inc;
4399 if (dev->promiscuity == 0) {
4402 * If inc causes overflow, untouch promisc and return error.
4405 dev->flags &= ~IFF_PROMISC;
4407 dev->promiscuity -= inc;
4408 printk(KERN_WARNING "%s: promiscuity touches roof, "
4409 "set promiscuity failed; promiscuity feature "
4410 "of the device might be broken.\n", dev->name);
4414 if (dev->flags != old_flags) {
4415 printk(KERN_INFO "device %s %s promiscuous mode\n",
4416 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4418 if (audit_enabled) {
4419 current_uid_gid(&uid, &gid);
4420 audit_log(current->audit_context, GFP_ATOMIC,
4421 AUDIT_ANOM_PROMISCUOUS,
4422 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4423 dev->name, (dev->flags & IFF_PROMISC),
4424 (old_flags & IFF_PROMISC),
4425 audit_get_loginuid(current),
4427 audit_get_sessionid(current));
4430 dev_change_rx_flags(dev, IFF_PROMISC);
4436 * dev_set_promiscuity - update promiscuity count on a device
4440 * Add or remove promiscuity from a device. While the count in the device
4441 * remains above zero the interface remains promiscuous. Once it hits zero
4442 * the device reverts back to normal filtering operation. A negative inc
4443 * value is used to drop promiscuity on the device.
4444 * Return 0 if successful or a negative errno code on error.
4446 int dev_set_promiscuity(struct net_device *dev, int inc)
4448 unsigned short old_flags = dev->flags;
4451 err = __dev_set_promiscuity(dev, inc);
4454 if (dev->flags != old_flags)
4455 dev_set_rx_mode(dev);
4458 EXPORT_SYMBOL(dev_set_promiscuity);
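/*
 * Editor's note: a hedged sketch of the counted promiscuity API above. A
 * packet-capture style user bumps the count while capturing and drops it
 * again on teardown. Taking rtnl_lock() around the calls is an assumption
 * of this sketch, not a statement of the locking rules.
 */
#if 0
static int example_start_capture(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* +1: enter promiscuous mode */
	rtnl_unlock();
	return err;
}

static void example_stop_capture(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* -1: drop our reference */
	rtnl_unlock();
}
#endif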
4461 * dev_set_allmulti - update allmulti count on a device
4465 * Add or remove reception of all multicast frames to a device. While the
4466 * count in the device remains above zero the interface remains listening
4467 * to all interfaces. Once it hits zero the device reverts back to normal
4468 * filtering operation. A negative @inc value is used to drop the counter
4469 * when releasing a resource needing all multicasts.
4470 * Return 0 if successful or a negative errno code on error.
4473 int dev_set_allmulti(struct net_device *dev, int inc)
4475 unsigned short old_flags = dev->flags;
4479 dev->flags |= IFF_ALLMULTI;
4480 dev->allmulti += inc;
4481 if (dev->allmulti == 0) {
4484 * If inc causes overflow, untouch allmulti and return error.
4487 dev->flags &= ~IFF_ALLMULTI;
4489 dev->allmulti -= inc;
4490 printk(KERN_WARNING "%s: allmulti touches roof, "
4491 "set allmulti failed; allmulti feature of "
4492 "the device might be broken.\n", dev->name);
4496 if (dev->flags ^ old_flags) {
4497 dev_change_rx_flags(dev, IFF_ALLMULTI);
4498 dev_set_rx_mode(dev);
4502 EXPORT_SYMBOL(dev_set_allmulti);
4505 * Upload unicast and multicast address lists to device and
4506 * configure RX filtering. When the device doesn't support unicast
4507 * filtering it is put in promiscuous mode while unicast addresses are present.
4510 void __dev_set_rx_mode(struct net_device *dev)
4512 const struct net_device_ops *ops = dev->netdev_ops;
4514 /* dev_open will call this function so the list will stay sane. */
4515 if (!(dev->flags&IFF_UP))
4518 if (!netif_device_present(dev))
4521 if (ops->ndo_set_rx_mode)
4522 ops->ndo_set_rx_mode(dev);
4524 /* Unicast address changes may only happen under the rtnl,
4525 * therefore calling __dev_set_promiscuity here is safe.
4527 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4528 __dev_set_promiscuity(dev, 1);
4529 dev->uc_promisc = 1;
4530 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4531 __dev_set_promiscuity(dev, -1);
4532 dev->uc_promisc = 0;
4535 if (ops->ndo_set_multicast_list)
4536 ops->ndo_set_multicast_list(dev);
4540 void dev_set_rx_mode(struct net_device *dev)
4542 netif_addr_lock_bh(dev);
4543 __dev_set_rx_mode(dev);
4544 netif_addr_unlock_bh(dev);
4548 * dev_get_flags - get flags reported to userspace
4551 * Get the combination of flag bits exported through APIs to userspace.
4553 unsigned dev_get_flags(const struct net_device *dev)
4557 flags = (dev->flags & ~(IFF_PROMISC |
4562 (dev->gflags & (IFF_PROMISC |
4565 if (netif_running(dev)) {
4566 if (netif_oper_up(dev))
4567 flags |= IFF_RUNNING;
4568 if (netif_carrier_ok(dev))
4569 flags |= IFF_LOWER_UP;
4570 if (netif_dormant(dev))
4571 flags |= IFF_DORMANT;
4576 EXPORT_SYMBOL(dev_get_flags);
4578 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4580 int old_flags = dev->flags;
4586 * Set the flags on our device.
4589 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4590 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4592 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4596 * Load in the correct multicast list now the flags have changed.
4599 if ((old_flags ^ flags) & IFF_MULTICAST)
4600 dev_change_rx_flags(dev, IFF_MULTICAST);
4602 dev_set_rx_mode(dev);
4605 * Have we downed the interface? We handle IFF_UP ourselves
4606 * according to user attempts to set it, rather than blindly setting it.
4611 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4612 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4615 dev_set_rx_mode(dev);
4618 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4619 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4621 dev->gflags ^= IFF_PROMISC;
4622 dev_set_promiscuity(dev, inc);
4625 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4626 is important. Some (broken) drivers set IFF_PROMISC when
4627 IFF_ALLMULTI is requested, without asking us and without reporting it.
4629 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4630 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4632 dev->gflags ^= IFF_ALLMULTI;
4633 dev_set_allmulti(dev, inc);
4639 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4641 unsigned int changes = dev->flags ^ old_flags;
4643 if (changes & IFF_UP) {
4644 if (dev->flags & IFF_UP)
4645 call_netdevice_notifiers(NETDEV_UP, dev);
4647 call_netdevice_notifiers(NETDEV_DOWN, dev);
4650 if (dev->flags & IFF_UP &&
4651 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4652 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4656 * dev_change_flags - change device settings
4658 * @flags: device state flags
4660 * Change settings on the device based on state flags. The flags are
4661 * in the userspace exported format.
4663 int dev_change_flags(struct net_device *dev, unsigned flags)
4666 int old_flags = dev->flags;
4668 ret = __dev_change_flags(dev, flags);
4672 changes = old_flags ^ dev->flags;
4674 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4676 __dev_notify_flags(dev, old_flags);
4679 EXPORT_SYMBOL(dev_change_flags);
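/*
 * Editor's note: a hedged sketch of driving dev_change_flags() the way the
 * SIOCSIFFLAGS ioctl path further down does, here to bring an interface up.
 * rtnl_lock() is assumed to be the required serialisation for this sketch.
 */
#if 0
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;
}
#endif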
4682 * dev_set_mtu - Change maximum transfer unit
4684 * @new_mtu: new transfer unit
4686 * Change the maximum transfer size of the network device.
4688 int dev_set_mtu(struct net_device *dev, int new_mtu)
4690 const struct net_device_ops *ops = dev->netdev_ops;
4693 if (new_mtu == dev->mtu)
4696 /* MTU must be positive. */
4700 if (!netif_device_present(dev))
4704 if (ops->ndo_change_mtu)
4705 err = ops->ndo_change_mtu(dev, new_mtu);
4709 if (!err && dev->flags & IFF_UP)
4710 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4713 EXPORT_SYMBOL(dev_set_mtu);
4716 * dev_set_group - Change group this device belongs to
4718 * @new_group: group this device should belong to
4720 void dev_set_group(struct net_device *dev, int new_group)
4722 dev->group = new_group;
4724 EXPORT_SYMBOL(dev_set_group);
4727 * dev_set_mac_address - Change Media Access Control Address
4731 * Change the hardware (MAC) address of the device
4733 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4735 const struct net_device_ops *ops = dev->netdev_ops;
4738 if (!ops->ndo_set_mac_address)
4740 if (sa->sa_family != dev->type)
4742 if (!netif_device_present(dev))
4744 err = ops->ndo_set_mac_address(dev, sa);
4746 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4749 EXPORT_SYMBOL(dev_set_mac_address);
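/*
 * Editor's note: a hedged sketch of the two setters above, mirroring the
 * SIOCSIFMTU and SIOCSIFHWADDR ioctl paths further down. The MAC address
 * value is purely illustrative, and rtnl_lock() is assumed to be the
 * required locking for this sketch.
 */
#if 0
static int example_reconfigure(struct net_device *dev)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;		/* must match the device type */
	memcpy(sa.sa_data, "\x02\x00\x00\x00\x00\x01", ETH_ALEN);

	rtnl_lock();
	err = dev_set_mtu(dev, 1500);
	if (!err)
		err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}
#endif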
4752 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4754 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4757 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4763 case SIOCGIFFLAGS: /* Get interface flags */
4764 ifr->ifr_flags = (short) dev_get_flags(dev);
4767 case SIOCGIFMETRIC: /* Get the metric on the interface
4768 (currently unused) */
4769 ifr->ifr_metric = 0;
4772 case SIOCGIFMTU: /* Get the MTU of a device */
4773 ifr->ifr_mtu = dev->mtu;
4778 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4780 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4781 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4782 ifr->ifr_hwaddr.sa_family = dev->type;
4790 ifr->ifr_map.mem_start = dev->mem_start;
4791 ifr->ifr_map.mem_end = dev->mem_end;
4792 ifr->ifr_map.base_addr = dev->base_addr;
4793 ifr->ifr_map.irq = dev->irq;
4794 ifr->ifr_map.dma = dev->dma;
4795 ifr->ifr_map.port = dev->if_port;
4799 ifr->ifr_ifindex = dev->ifindex;
4803 ifr->ifr_qlen = dev->tx_queue_len;
4807 /* dev_ioctl() should ensure this case is never reached.
4819 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4821 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4824 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4825 const struct net_device_ops *ops;
4830 ops = dev->netdev_ops;
4833 case SIOCSIFFLAGS: /* Set interface flags */
4834 return dev_change_flags(dev, ifr->ifr_flags);
4836 case SIOCSIFMETRIC: /* Set the metric on the interface
4837 (currently unused) */
4840 case SIOCSIFMTU: /* Set the MTU of a device */
4841 return dev_set_mtu(dev, ifr->ifr_mtu);
4844 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4846 case SIOCSIFHWBROADCAST:
4847 if (ifr->ifr_hwaddr.sa_family != dev->type)
4849 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4850 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4851 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4855 if (ops->ndo_set_config) {
4856 if (!netif_device_present(dev))
4858 return ops->ndo_set_config(dev, &ifr->ifr_map);
4863 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4864 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4866 if (!netif_device_present(dev))
4868 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4871 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4872 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4874 if (!netif_device_present(dev))
4876 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4879 if (ifr->ifr_qlen < 0)
4881 dev->tx_queue_len = ifr->ifr_qlen;
4885 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4886 return dev_change_name(dev, ifr->ifr_newname);
4889 * Unknown or private ioctl
4892 if ((cmd >= SIOCDEVPRIVATE &&
4893 cmd <= SIOCDEVPRIVATE + 15) ||
4894 cmd == SIOCBONDENSLAVE ||
4895 cmd == SIOCBONDRELEASE ||
4896 cmd == SIOCBONDSETHWADDR ||
4897 cmd == SIOCBONDSLAVEINFOQUERY ||
4898 cmd == SIOCBONDINFOQUERY ||
4899 cmd == SIOCBONDCHANGEACTIVE ||
4900 cmd == SIOCGMIIPHY ||
4901 cmd == SIOCGMIIREG ||
4902 cmd == SIOCSMIIREG ||
4903 cmd == SIOCBRADDIF ||
4904 cmd == SIOCBRDELIF ||
4905 cmd == SIOCSHWTSTAMP ||
4906 cmd == SIOCWANDEV) {
4908 if (ops->ndo_do_ioctl) {
4909 if (netif_device_present(dev))
4910 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4922 * This function handles all "interface"-type I/O control requests. The actual
4923 * 'doing' part of this is dev_ifsioc above.
4927 * dev_ioctl - network device ioctl
4928 * @net: the applicable net namespace
4929 * @cmd: command to issue
4930 * @arg: pointer to a struct ifreq in user space
4932 * Issue ioctl functions to devices. This is normally called by the
4933 * user space syscall interfaces but can sometimes be useful for
4934 * other purposes. The return value is the return from the syscall if
4935 * positive or a negative errno code on error.
4938 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4944 /* One special case: SIOCGIFCONF takes ifconf argument
4945 and requires shared lock, because it sleeps writing the data.
4949 if (cmd == SIOCGIFCONF) {
4951 ret = dev_ifconf(net, (char __user *) arg);
4955 if (cmd == SIOCGIFNAME)
4956 return dev_ifname(net, (struct ifreq __user *)arg);
4958 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4961 ifr.ifr_name[IFNAMSIZ-1] = 0;
4963 colon = strchr(ifr.ifr_name, ':');
4968 * See which interface the caller is talking about.
4973 * These ioctl calls:
4974 * - can be done by all.
4975 * - atomic and do not require locking.
4986 dev_load(net, ifr.ifr_name);
4988 ret = dev_ifsioc_locked(net, &ifr, cmd);
4993 if (copy_to_user(arg, &ifr,
4994 sizeof(struct ifreq)))
5000 dev_load(net, ifr.ifr_name);
5002 ret = dev_ethtool(net, &ifr);
5007 if (copy_to_user(arg, &ifr,
5008 sizeof(struct ifreq)))
5014 * These ioctl calls:
5015 * - require superuser power.
5016 * - require strict serialization.
5022 if (!capable(CAP_NET_ADMIN))
5024 dev_load(net, ifr.ifr_name);
5026 ret = dev_ifsioc(net, &ifr, cmd);
5031 if (copy_to_user(arg, &ifr,
5032 sizeof(struct ifreq)))
5038 * These ioctl calls:
5039 * - require superuser power.
5040 * - require strict serialization.
5041 * - do not return a value
5051 case SIOCSIFHWBROADCAST:
5054 case SIOCBONDENSLAVE:
5055 case SIOCBONDRELEASE:
5056 case SIOCBONDSETHWADDR:
5057 case SIOCBONDCHANGEACTIVE:
5061 if (!capable(CAP_NET_ADMIN))
5064 case SIOCBONDSLAVEINFOQUERY:
5065 case SIOCBONDINFOQUERY:
5066 dev_load(net, ifr.ifr_name);
5068 ret = dev_ifsioc(net, &ifr, cmd);
5073 /* Get the per device memory space. We can add this but
5074 * currently do not support it */
5076 /* Set the per device memory buffer space.
5077 * Not applicable in our case */
5082 * Unknown or private ioctl.
5085 if (cmd == SIOCWANDEV ||
5086 (cmd >= SIOCDEVPRIVATE &&
5087 cmd <= SIOCDEVPRIVATE + 15)) {
5088 dev_load(net, ifr.ifr_name);
5090 ret = dev_ifsioc(net, &ifr, cmd);
5092 if (!ret && copy_to_user(arg, &ifr,
5093 sizeof(struct ifreq)))
5097 /* Take care of Wireless Extensions */
5098 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5099 return wext_handle_ioctl(net, &ifr, cmd, arg);
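/*
 * Editor's note: dev_ioctl() is the kernel end of the SIOCxIFxxx calls. A
 * hedged user-space sketch of the other end, querying an interface MTU via
 * SIOCGIFMTU (the interface name is illustrative):
 */
#if 0
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <unistd.h>

int example_get_mtu(const char *ifname)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	if (ioctl(fd, SIOCGIFMTU, &ifr) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return ifr.ifr_mtu;	/* filled in by dev_ifsioc_locked() above */
}
#endif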
5106 * dev_new_index - allocate an ifindex
5107 * @net: the applicable net namespace
5109 * Returns a suitable unique value for a new device interface
5110 * number. The caller must hold the rtnl semaphore or the
5111 * dev_base_lock to be sure it remains unique.
5113 static int dev_new_index(struct net *net)
5119 if (!__dev_get_by_index(net, ifindex))
5124 /* Delayed registration/unregistration */
5125 static LIST_HEAD(net_todo_list);
5127 static void net_set_todo(struct net_device *dev)
5129 list_add_tail(&dev->todo_list, &net_todo_list);
5132 static void rollback_registered_many(struct list_head *head)
5134 struct net_device *dev, *tmp;
5136 BUG_ON(dev_boot_phase);
5139 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5140 /* Some devices call without registering
5141 * for initialization unwind. Remove those
5142 * devices and proceed with the remaining.
5144 if (dev->reg_state == NETREG_UNINITIALIZED) {
5145 pr_debug("unregister_netdevice: device %s/%p never "
5146 "was registered\n", dev->name, dev);
5149 list_del(&dev->unreg_list);
5153 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5156 /* If device is running, close it first. */
5157 dev_close_many(head);
5159 list_for_each_entry(dev, head, unreg_list) {
5160 /* And unlink it from device chain. */
5161 unlist_netdevice(dev);
5163 dev->reg_state = NETREG_UNREGISTERING;
5168 list_for_each_entry(dev, head, unreg_list) {
5169 /* Shutdown queueing discipline. */
5173 /* Notify protocols that we are about to destroy
5174 this device. They should clean up all of their state.
5176 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5178 if (!dev->rtnl_link_ops ||
5179 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5180 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5183 * Flush the unicast and multicast chains
5188 if (dev->netdev_ops->ndo_uninit)
5189 dev->netdev_ops->ndo_uninit(dev);
5191 /* Notifier chain MUST detach us from master device. */
5192 WARN_ON(dev->master);
5194 /* Remove entries from kobject tree */
5195 netdev_unregister_kobject(dev);
5198 /* Process any work delayed until the end of the batch */
5199 dev = list_first_entry(head, struct net_device, unreg_list);
5200 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5204 list_for_each_entry(dev, head, unreg_list)
5208 static void rollback_registered(struct net_device *dev)
5212 list_add(&dev->unreg_list, &single);
5213 rollback_registered_many(&single);
5216 unsigned long netdev_fix_features(unsigned long features, const char *name)
5218 /* Fix illegal SG+CSUM combinations. */
5219 if ((features & NETIF_F_SG) &&
5220 !(features & NETIF_F_ALL_CSUM)) {
5222 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5223 "checksum feature.\n", name);
5224 features &= ~NETIF_F_SG;
5227 /* TSO requires that SG is present as well. */
5228 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5230 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5231 "SG feature.\n", name);
5232 features &= ~NETIF_F_TSO;
5235 if (features & NETIF_F_UFO) {
5236 /* maybe split UFO into V4 and V6? */
5237 if (!((features & NETIF_F_GEN_CSUM) ||
5238 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5239 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5241 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5242 "since no checksum offload features.\n",
5244 features &= ~NETIF_F_UFO;
5247 if (!(features & NETIF_F_SG)) {
5249 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5250 "since no NETIF_F_SG feature.\n", name);
5251 features &= ~NETIF_F_UFO;
5257 EXPORT_SYMBOL(netdev_fix_features);
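/*
 * A minimal sketch (hypothetical driver code): a driver can pass its wished-for
 * feature mask through netdev_fix_features() before registration so that
 * illegal combinations, e.g. NETIF_F_TSO without NETIF_F_SG, are dropped as
 * described above. example_sanitize_features is an assumption, not a real API.
 */
static void example_sanitize_features(struct net_device *dev)
{
	/* wish for scatter/gather and TSO, but advertise no checksum offload */
	dev->features |= NETIF_F_SG | NETIF_F_TSO;

	/* NETIF_F_SG (and therefore NETIF_F_TSO) will be cleared again here */
	dev->features = netdev_fix_features(dev->features, dev->name);
}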
5260 * netif_stacked_transfer_operstate - transfer operstate
5261 * @rootdev: the root or lower level device to transfer state from
5262 * @dev: the device to transfer operstate to
5264 * Transfer operational state from root to device. This is normally
5265 * called when a stacking relationship exists between the root
5266 * device and the device (a leaf device).
5268 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5269 struct net_device *dev)
5271 if (rootdev->operstate == IF_OPER_DORMANT)
5272 netif_dormant_on(dev);
5274 netif_dormant_off(dev);
5276 if (netif_carrier_ok(rootdev)) {
5277 if (!netif_carrier_ok(dev))
5278 netif_carrier_on(dev);
5280 if (netif_carrier_ok(dev))
5281 netif_carrier_off(dev);
5284 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
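/*
 * A minimal sketch (hypothetical stacked driver): mirror the lower device's
 * carrier and dormant state onto an upper device from a netdevice notifier.
 * example_lower, example_upper and example_operstate_event are assumptions.
 */
static struct net_device *example_lower, *example_upper;

static int example_operstate_event(struct notifier_block *nb,
				   unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_CHANGE && dev == example_lower && example_upper)
		netif_stacked_transfer_operstate(example_lower, example_upper);
	return NOTIFY_DONE;
}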
5287 static int netif_alloc_rx_queues(struct net_device *dev)
5289 unsigned int i, count = dev->num_rx_queues;
5290 struct netdev_rx_queue *rx;
5294 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5296 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5301 for (i = 0; i < count; i++)
5307 static void netdev_init_one_queue(struct net_device *dev,
5308 struct netdev_queue *queue, void *_unused)
5310 /* Initialize queue lock */
5311 spin_lock_init(&queue->_xmit_lock);
5312 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5313 queue->xmit_lock_owner = -1;
5314 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5318 static int netif_alloc_netdev_queues(struct net_device *dev)
5320 unsigned int count = dev->num_tx_queues;
5321 struct netdev_queue *tx;
5325 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5327 pr_err("netdev: Unable to allocate %u tx queues.\n",
5333 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5334 spin_lock_init(&dev->tx_global_lock);
5340 * register_netdevice - register a network device
5341 * @dev: device to register
5343 * Take a completed network device structure and add it to the kernel
5344 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5345 * chain. 0 is returned on success. A negative errno code is returned
5346 * on a failure to set up the device, or if the name is a duplicate.
5348 * Callers must hold the rtnl semaphore. You may want
5349 * register_netdev() instead of this.
5352 * The locking appears insufficient to guarantee two parallel registers
5353 * will not get the same name.
5356 int register_netdevice(struct net_device *dev)
5359 struct net *net = dev_net(dev);
5361 BUG_ON(dev_boot_phase);
5366 /* When net_device's are persistent, this will be fatal. */
5367 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5370 spin_lock_init(&dev->addr_list_lock);
5371 netdev_set_addr_lockdep_class(dev);
5375 /* Init, if this function is available */
5376 if (dev->netdev_ops->ndo_init) {
5377 ret = dev->netdev_ops->ndo_init(dev);
5385 ret = dev_get_valid_name(dev, dev->name, 0);
5389 dev->ifindex = dev_new_index(net);
5390 if (dev->iflink == -1)
5391 dev->iflink = dev->ifindex;
5393 /* Fix illegal checksum combinations */
5394 if ((dev->features & NETIF_F_HW_CSUM) &&
5395 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5396 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5398 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5401 if ((dev->features & NETIF_F_NO_CSUM) &&
5402 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5403 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5405 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5408 dev->features = netdev_fix_features(dev->features, dev->name);
5410 /* Enable software GSO if SG is supported. */
5411 if (dev->features & NETIF_F_SG)
5412 dev->features |= NETIF_F_GSO;
5414 /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5415 * vlan_dev_init() will do the dev->features check, so these features
5416 * are enabled only if supported by underlying device.
5418 dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5420 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5421 ret = notifier_to_errno(ret);
5425 ret = netdev_register_kobject(dev);
5428 dev->reg_state = NETREG_REGISTERED;
5431 * Default initial state at registry is that the
5432 * device is present.
5435 set_bit(__LINK_STATE_PRESENT, &dev->state);
5437 dev_init_scheduler(dev);
5439 list_netdevice(dev);
5441 /* Notify protocols, that a new device appeared. */
5442 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5443 ret = notifier_to_errno(ret);
5445 rollback_registered(dev);
5446 dev->reg_state = NETREG_UNREGISTERED;
5449 * Prevent userspace races by waiting until the network
5450 * device is fully set up before sending notifications.
5452 if (!dev->rtnl_link_ops ||
5453 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5454 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5460 if (dev->netdev_ops->ndo_uninit)
5461 dev->netdev_ops->ndo_uninit(dev);
5464 EXPORT_SYMBOL(register_netdevice);
5467 * init_dummy_netdev - init a dummy network device for NAPI
5468 * @dev: device to init
5470 * This takes a network device structure and initializes the minimum
5471 * number of fields so it can be used to schedule NAPI polls without
5472 * registering a full blown interface. This is to be used by drivers
5473 * that need to tie several hardware interfaces to a single NAPI
5474 * poll scheduler due to HW limitations.
5476 int init_dummy_netdev(struct net_device *dev)
5478 /* Clear everything. Note we don't initialize spinlocks
5479 * as they aren't supposed to be taken by any of the
5480 * NAPI code and this dummy netdev is supposed to be
5481 * only ever used for NAPI polls
5483 memset(dev, 0, sizeof(struct net_device));
5485 /* make sure we BUG if trying to hit standard
5486 * register/unregister code path
5488 dev->reg_state = NETREG_DUMMY;
5490 /* NAPI wants this */
5491 INIT_LIST_HEAD(&dev->napi_list);
5493 /* a dummy interface is started by default */
5494 set_bit(__LINK_STATE_PRESENT, &dev->state);
5495 set_bit(__LINK_STATE_START, &dev->state);
5497 /* Note : We don't allocate pcpu_refcnt for dummy devices,
5498 * because users of this 'device' don't need to change its refcount.
5504 EXPORT_SYMBOL_GPL(init_dummy_netdev);
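/*
 * A minimal sketch (hypothetical driver): use a dummy netdev purely as the
 * anchor for a NAPI instance shared by several hardware channels, as the
 * comment above describes. struct example_hw and example_setup_napi are
 * assumptions; the weight of 64 is just a typical value.
 */
struct example_hw {
	struct net_device napi_dev;	/* dummy, never registered */
	struct napi_struct napi;
};

static void example_setup_napi(struct example_hw *hw,
			       int (*poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&hw->napi_dev);
	netif_napi_add(&hw->napi_dev, &hw->napi, poll, 64);
	napi_enable(&hw->napi);
}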
5508 * register_netdev - register a network device
5509 * @dev: device to register
5511 * Take a completed network device structure and add it to the kernel
5512 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5513 * chain. 0 is returned on success. A negative errno code is returned
5514 * on a failure to set up the device, or if the name is a duplicate.
5516 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5517 * and expands the device name if you passed a format string to alloc_netdev.
5520 int register_netdev(struct net_device *dev)
5527 * If the name is a format string, the caller wants us to do a name allocation.
5530 if (strchr(dev->name, '%')) {
5531 err = dev_alloc_name(dev, dev->name);
5536 err = register_netdevice(dev);
5541 EXPORT_SYMBOL(register_netdev);
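/*
 * A minimal sketch (hypothetical probe path): allocate an Ethernet device,
 * attach the driver's operations and register it; on failure the memory is
 * released again with free_netdev(). example_netdev_ops and example_probe are
 * assumptions standing in for a real driver.
 */
static const struct net_device_ops example_netdev_ops;

static int example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(0);	/* no private area in this sketch */
	if (!dev)
		return -ENOMEM;

	dev->netdev_ops = &example_netdev_ops;

	err = register_netdev(dev);	/* takes the rtnl semaphore itself */
	if (err)
		free_netdev(dev);
	return err;
}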
5543 int netdev_refcnt_read(const struct net_device *dev)
5547 for_each_possible_cpu(i)
5548 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5551 EXPORT_SYMBOL(netdev_refcnt_read);
5554 * netdev_wait_allrefs - wait until all references are gone.
5556 * This is called when unregistering network devices.
5558 * Any protocol or device that holds a reference should register
5559 * for netdevice notification, and clean up and put back the
5560 * reference if they receive an UNREGISTER event.
5561 * We can get stuck here if buggy protocols don't correctly call dev_put.
5564 static void netdev_wait_allrefs(struct net_device *dev)
5566 unsigned long rebroadcast_time, warning_time;
5569 linkwatch_forget_dev(dev);
5571 rebroadcast_time = warning_time = jiffies;
5572 refcnt = netdev_refcnt_read(dev);
5574 while (refcnt != 0) {
5575 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5578 /* Rebroadcast unregister notification */
5579 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5580 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5581 * should have already handled it the first time */
5583 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5585 /* We must not have linkwatch events
5586 * pending on unregister. If this
5587 * happens, we simply run the queue
5588 * unscheduled, resulting in a noop
5591 linkwatch_run_queue();
5596 rebroadcast_time = jiffies;
5601 refcnt = netdev_refcnt_read(dev);
5603 if (time_after(jiffies, warning_time + 10 * HZ)) {
5604 printk(KERN_EMERG "unregister_netdevice: "
5605 "waiting for %s to become free. Usage "
5608 warning_time = jiffies;
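/*
 * A minimal sketch (hypothetical protocol code): a well-behaved user of
 * dev_hold() registers a netdevice notifier and drops its reference on
 * NETDEV_UNREGISTER, so the wait loop above can terminate. example_cached_dev
 * and example_unregister_event are assumptions for illustration.
 */
static struct net_device *example_cached_dev;

static int example_unregister_event(struct notifier_block *nb,
				    unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UNREGISTER && dev == example_cached_dev) {
		dev_put(example_cached_dev);
		example_cached_dev = NULL;
	}
	return NOTIFY_DONE;
}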
5617 * register_netdevice(x1);
5618 * register_netdevice(x2);
5620 * unregister_netdevice(y1);
5621 * unregister_netdevice(y2);
5627 * We are invoked by rtnl_unlock().
5628 * This allows us to deal with problems:
5629 * 1) We can delete sysfs objects which invoke hotplug
5630 * without deadlocking with linkwatch via keventd.
5631 * 2) Since we run with the RTNL semaphore not held, we can sleep
5632 * safely in order to wait for the netdev refcnt to drop to zero.
5634 * We must not return until all unregister events added during
5635 * the interval the lock was held have been completed.
5637 void netdev_run_todo(void)
5639 struct list_head list;
5641 /* Snapshot list, allow later requests */
5642 list_replace_init(&net_todo_list, &list);
5646 while (!list_empty(&list)) {
5647 struct net_device *dev
5648 = list_first_entry(&list, struct net_device, todo_list);
5649 list_del(&dev->todo_list);
5651 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5652 printk(KERN_ERR "network todo '%s' but state %d\n",
5653 dev->name, dev->reg_state);
5658 dev->reg_state = NETREG_UNREGISTERED;
5660 on_each_cpu(flush_backlog, dev, 1);
5662 netdev_wait_allrefs(dev);
5665 BUG_ON(netdev_refcnt_read(dev));
5666 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5667 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5668 WARN_ON(dev->dn_ptr);
5670 if (dev->destructor)
5671 dev->destructor(dev);
5673 /* Free network device */
5674 kobject_put(&dev->dev.kobj);
5678 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5679 * fields in the same order, with only the type differing.
5681 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5682 const struct net_device_stats *netdev_stats)
5684 #if BITS_PER_LONG == 64
5685 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5686 memcpy(stats64, netdev_stats, sizeof(*stats64));
5688 size_t i, n = sizeof(*stats64) / sizeof(u64);
5689 const unsigned long *src = (const unsigned long *)netdev_stats;
5690 u64 *dst = (u64 *)stats64;
5692 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5693 sizeof(*stats64) / sizeof(u64));
5694 for (i = 0; i < n; i++)
5700 * dev_get_stats - get network device statistics
5701 * @dev: device to get statistics from
5702 * @storage: place to store stats
5704 * Get network statistics from device. Return @storage.
5705 * The device driver may provide its own method by setting
5706 * dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5707 * otherwise the internal statistics structure is used.
5709 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5710 struct rtnl_link_stats64 *storage)
5712 const struct net_device_ops *ops = dev->netdev_ops;
5714 if (ops->ndo_get_stats64) {
5715 memset(storage, 0, sizeof(*storage));
5716 ops->ndo_get_stats64(dev, storage);
5717 } else if (ops->ndo_get_stats) {
5718 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5720 netdev_stats_to_stats64(storage, &dev->stats);
5722 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5725 EXPORT_SYMBOL(dev_get_stats);
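/*
 * A minimal sketch (hypothetical driver): provide ndo_get_stats64 so that the
 * first branch in dev_get_stats() above is taken and 64-bit counters are
 * reported. The counter source used here is only a placeholder.
 */
static struct rtnl_link_stats64 *example_get_stats64(struct net_device *dev,
						     struct rtnl_link_stats64 *storage)
{
	/* dev_get_stats() has already zeroed *storage before calling us */
	storage->rx_packets = dev->stats.rx_packets;	/* placeholder source */
	storage->tx_packets = dev->stats.tx_packets;
	return storage;
}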
5727 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5729 struct netdev_queue *queue = dev_ingress_queue(dev);
5731 #ifdef CONFIG_NET_CLS_ACT
5734 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5737 netdev_init_one_queue(dev, queue, NULL);
5738 queue->qdisc = &noop_qdisc;
5739 queue->qdisc_sleeping = &noop_qdisc;
5740 rcu_assign_pointer(dev->ingress_queue, queue);
5746 * alloc_netdev_mqs - allocate network device
5747 * @sizeof_priv: size of private data to allocate space for
5748 * @name: device name format string
5749 * @setup: callback to initialize device
5750 * @txqs: the number of TX subqueues to allocate
5751 * @rxqs: the number of RX subqueues to allocate
5753 * Allocates a struct net_device with private data area for driver use
5754 * and performs basic initialization. Also allocates subqueue structs
5755 * for each queue on the device.
5757 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5758 void (*setup)(struct net_device *),
5759 unsigned int txqs, unsigned int rxqs)
5761 struct net_device *dev;
5763 struct net_device *p;
5765 BUG_ON(strlen(name) >= sizeof(dev->name));
5768 pr_err("alloc_netdev: Unable to allocate device "
5769 "with zero queues.\n");
5775 pr_err("alloc_netdev: Unable to allocate device "
5776 "with zero RX queues.\n");
5781 alloc_size = sizeof(struct net_device);
5783 /* ensure 32-byte alignment of private area */
5784 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5785 alloc_size += sizeof_priv;
5787 /* ensure 32-byte alignment of whole construct */
5788 alloc_size += NETDEV_ALIGN - 1;
5790 p = kzalloc(alloc_size, GFP_KERNEL);
5792 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5796 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5797 dev->padded = (char *)dev - (char *)p;
5799 dev->pcpu_refcnt = alloc_percpu(int);
5800 if (!dev->pcpu_refcnt)
5803 if (dev_addr_init(dev))
5809 dev_net_set(dev, &init_net);
5811 dev->num_tx_queues = txqs;
5812 dev->real_num_tx_queues = txqs;
5813 if (netif_alloc_netdev_queues(dev))
5817 dev->num_rx_queues = rxqs;
5818 dev->real_num_rx_queues = rxqs;
5819 if (netif_alloc_rx_queues(dev))
5823 dev->gso_max_size = GSO_MAX_SIZE;
5825 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5826 dev->ethtool_ntuple_list.count = 0;
5827 INIT_LIST_HEAD(&dev->napi_list);
5828 INIT_LIST_HEAD(&dev->unreg_list);
5829 INIT_LIST_HEAD(&dev->link_watch_list);
5830 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5832 strcpy(dev->name, name);
5833 dev->group = INIT_NETDEV_GROUP;
5837 free_percpu(dev->pcpu_refcnt);
5847 EXPORT_SYMBOL(alloc_netdev_mqs);
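/*
 * A minimal sketch (hypothetical driver): allocate a multiqueue Ethernet
 * device with four TX and four RX queues plus a private area, using the
 * standard ether_setup() helper. struct example_priv is an assumption.
 */
struct example_priv {
	int example_state;	/* hypothetical per-device driver state */
};

static struct net_device *example_alloc(void)
{
	return alloc_netdev_mqs(sizeof(struct example_priv), "eth%d",
				ether_setup, 4, 4);
}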
5850 * free_netdev - free network device
5853 * This function does the last stage of destroying an allocated device
5854 * interface. The reference to the device object is released.
5855 * If this is the last reference then it will be freed.
5857 void free_netdev(struct net_device *dev)
5859 struct napi_struct *p, *n;
5861 release_net(dev_net(dev));
5868 kfree(rcu_dereference_raw(dev->ingress_queue));
5870 /* Flush device addresses */
5871 dev_addr_flush(dev);
5873 /* Clear ethtool n-tuple list */
5874 ethtool_ntuple_flush(dev);
5876 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5879 free_percpu(dev->pcpu_refcnt);
5880 dev->pcpu_refcnt = NULL;
5882 /* Compatibility with error handling in drivers */
5883 if (dev->reg_state == NETREG_UNINITIALIZED) {
5884 kfree((char *)dev - dev->padded);
5888 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5889 dev->reg_state = NETREG_RELEASED;
5891 /* will free via device release */
5892 put_device(&dev->dev);
5894 EXPORT_SYMBOL(free_netdev);
5897 * synchronize_net - Synchronize with packet receive processing
5899 * Wait for packets currently being received to be done.
5900 * Does not block later packets from starting.
5902 void synchronize_net(void)
5907 EXPORT_SYMBOL(synchronize_net);
5910 * unregister_netdevice_queue - remove device from the kernel
5914 * This function shuts down a device interface and removes it
5915 * from the kernel tables.
5916 * If @head is not NULL, the device is queued to be unregistered later.
5918 * Callers must hold the rtnl semaphore. You may want
5919 * unregister_netdev() instead of this.
5922 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5927 list_move_tail(&dev->unreg_list, head);
5929 rollback_registered(dev);
5930 /* Finish processing unregister after unlock */
5934 EXPORT_SYMBOL(unregister_netdevice_queue);
5937 * unregister_netdevice_many - unregister many devices
5938 * @head: list of devices
5940 void unregister_netdevice_many(struct list_head *head)
5942 struct net_device *dev;
5944 if (!list_empty(head)) {
5945 rollback_registered_many(head);
5946 list_for_each_entry(dev, head, unreg_list)
5950 EXPORT_SYMBOL(unregister_netdevice_many);
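/*
 * A minimal sketch (hypothetical caller): queue several devices and tear them
 * down in one batch under RTNL, which amortizes the unregister notifiers and
 * RCU synchronization. This assumes each device's destructor (typically
 * free_netdev) releases its memory once netdev_run_todo() has finished.
 */
static void example_unregister_group(struct net_device *a, struct net_device *b)
{
	LIST_HEAD(list);

	rtnl_lock();
	unregister_netdevice_queue(a, &list);
	unregister_netdevice_queue(b, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}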
5953 * unregister_netdev - remove device from the kernel
5956 * This function shuts down a device interface and removes it
5957 * from the kernel tables.
5959 * This is just a wrapper for unregister_netdevice that takes
5960 * the rtnl semaphore. In general you want to use this and not
5961 * unregister_netdevice.
5963 void unregister_netdev(struct net_device *dev)
5966 unregister_netdevice(dev);
5969 EXPORT_SYMBOL(unregister_netdev);
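/*
 * A minimal sketch (hypothetical remove path): the usual teardown sequence is
 * to unregister the interface and then release it with free_netdev().
 * example_remove is an assumption standing in for a driver's remove callback.
 */
static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* takes and releases the rtnl semaphore */
	free_netdev(dev);
}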
5972 * dev_change_net_namespace - move device to a different network namespace
5974 * @net: network namespace
5975 * @pat: if not NULL, name pattern to try if the current device name
5976 * is already taken in the destination network namespace.
5978 * This function shuts down a device interface and moves it
5979 * to a new network namespace. On success 0 is returned; on
5980 * failure a negative errno code is returned.
5982 * Callers must hold the rtnl semaphore.
5985 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5991 /* Don't allow namespace local devices to be moved. */
5993 if (dev->features & NETIF_F_NETNS_LOCAL)
5996 /* Ensure the device has been registered */
5998 if (dev->reg_state != NETREG_REGISTERED)
6001 /* Get out if there is nothing to do */
6003 if (net_eq(dev_net(dev), net))
6006 /* Pick the destination device name, and ensure
6007 * we can use it in the destination network namespace.
6010 if (__dev_get_by_name(net, dev->name)) {
6011 /* We get here if we can't use the current device name */
6014 if (dev_get_valid_name(dev, pat, 1))
6019 * And now a mini version of register_netdevice and unregister_netdevice.
6022 /* If device is running close it first. */
6025 /* And unlink it from device chain */
6027 unlist_netdevice(dev);
6031 /* Shutdown queueing discipline. */
6034 /* Notify protocols that we are about to destroy
6035 this device. They should clean up all of their state.
6037 Note that dev->reg_state stays at NETREG_REGISTERED.
6038 This is wanted because this way 8021q and macvlan know
6039 the device is just moving and can keep their slaves up.
6041 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6042 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6045 * Flush the unicast and multicast chains
6050 /* Actually switch the network namespace */
6051 dev_net_set(dev, net);
6053 /* If there is an ifindex conflict assign a new one */
6054 if (__dev_get_by_index(net, dev->ifindex)) {
6055 int iflink = (dev->iflink == dev->ifindex);
6056 dev->ifindex = dev_new_index(net);
6058 dev->iflink = dev->ifindex;
6061 /* Fixup kobjects */
6062 err = device_rename(&dev->dev, dev->name);
6065 /* Add the device back in the hashes */
6066 list_netdevice(dev);
6068 /* Notify protocols, that a new device appeared. */
6069 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6072 * Prevent userspace races by waiting until the network
6073 * device is fully set up before sending notifications.
6075 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6082 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
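/*
 * A minimal sketch (hypothetical caller): move a device into another network
 * namespace under RTNL, falling back to a "dev%d" name if the current name is
 * already taken there. example_move_dev is an assumption.
 */
static int example_move_dev(struct net_device *dev, struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();
	return err;
}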
6084 static int dev_cpu_callback(struct notifier_block *nfb,
6085 unsigned long action,
6088 struct sk_buff **list_skb;
6089 struct sk_buff *skb;
6090 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6091 struct softnet_data *sd, *oldsd;
6093 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6096 local_irq_disable();
6097 cpu = smp_processor_id();
6098 sd = &per_cpu(softnet_data, cpu);
6099 oldsd = &per_cpu(softnet_data, oldcpu);
6101 /* Find end of our completion_queue. */
6102 list_skb = &sd->completion_queue;
6104 list_skb = &(*list_skb)->next;
6105 /* Append completion queue from offline CPU. */
6106 *list_skb = oldsd->completion_queue;
6107 oldsd->completion_queue = NULL;
6109 /* Append output queue from offline CPU. */
6110 if (oldsd->output_queue) {
6111 *sd->output_queue_tailp = oldsd->output_queue;
6112 sd->output_queue_tailp = oldsd->output_queue_tailp;
6113 oldsd->output_queue = NULL;
6114 oldsd->output_queue_tailp = &oldsd->output_queue;
6117 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6120 /* Process offline CPU's input_pkt_queue */
6121 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6123 input_queue_head_incr(oldsd);
6125 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6127 input_queue_head_incr(oldsd);
6135 * netdev_increment_features - increment feature set by one
6136 * @all: current feature set
6137 * @one: new feature set
6138 * @mask: mask feature set
6140 * Computes a new feature set after adding a device with feature set
6141 * @one to the master device with current feature set @all. Will not
6142 * enable anything that is off in @mask. Returns the new feature set.
6144 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
6147 /* If device needs checksumming, downgrade to it. */
6148 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6149 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6150 else if (mask & NETIF_F_ALL_CSUM) {
6151 /* If one device supports v4/v6 checksumming, set for all. */
6152 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6153 !(all & NETIF_F_GEN_CSUM)) {
6154 all &= ~NETIF_F_ALL_CSUM;
6155 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6158 /* If one device supports hw checksumming, set for all. */
6159 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6160 all &= ~NETIF_F_ALL_CSUM;
6161 all |= NETIF_F_HW_CSUM;
6165 one |= NETIF_F_ALL_CSUM;
6167 one |= all & NETIF_F_ONE_FOR_ALL;
6168 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6169 all |= one & mask & NETIF_F_ONE_FOR_ALL;
6173 EXPORT_SYMBOL(netdev_increment_features);
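/*
 * A minimal sketch (hypothetical aggregating driver): recompute a master's
 * feature set as each slave is added, the way a bonding-style device would.
 * The mask used here is only an example choice.
 */
static unsigned long example_add_slave_features(unsigned long master_features,
						const struct net_device *slave)
{
	return netdev_increment_features(master_features, slave->features,
					 NETIF_F_ALL_CSUM | NETIF_F_SG |
					 NETIF_F_TSO);
}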
6175 static struct hlist_head *netdev_create_hash(void)
6178 struct hlist_head *hash;
6180 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6182 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6183 INIT_HLIST_HEAD(&hash[i]);
6188 /* Initialize per network namespace state */
6189 static int __net_init netdev_init(struct net *net)
6191 INIT_LIST_HEAD(&net->dev_base_head);
6193 net->dev_name_head = netdev_create_hash();
6194 if (net->dev_name_head == NULL)
6197 net->dev_index_head = netdev_create_hash();
6198 if (net->dev_index_head == NULL)
6204 kfree(net->dev_name_head);
6210 * netdev_drivername - network driver for the device
6211 * @dev: network device
6212 * @buffer: buffer for resulting name
6213 * @len: size of buffer
6215 * Determine network driver for device.
6217 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6219 const struct device_driver *driver;
6220 const struct device *parent;
6222 if (len <= 0 || !buffer)
6226 parent = dev->dev.parent;
6231 driver = parent->driver;
6232 if (driver && driver->name)
6233 strlcpy(buffer, driver->name, len);
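/*
 * A minimal sketch (hypothetical caller): report which driver is responsible
 * for a misbehaving device, much as the transmit watchdog does. The buffer
 * size and message are illustrative only.
 */
static void example_report_stall(struct net_device *dev)
{
	char drivername[64];

	printk(KERN_WARNING "%s (%s): transmit queue timed out\n",
	       dev->name,
	       netdev_drivername(dev, drivername, sizeof(drivername)));
}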
6237 static int __netdev_printk(const char *level, const struct net_device *dev,
6238 struct va_format *vaf)
6242 if (dev && dev->dev.parent)
6243 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6244 netdev_name(dev), vaf);
6246 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6248 r = printk("%s(NULL net_device): %pV", level, vaf);
6253 int netdev_printk(const char *level, const struct net_device *dev,
6254 const char *format, ...)
6256 struct va_format vaf;
6260 va_start(args, format);
6265 r = __netdev_printk(level, dev, &vaf);
6270 EXPORT_SYMBOL(netdev_printk);
6272 #define define_netdev_printk_level(func, level) \
6273 int func(const struct net_device *dev, const char *fmt, ...) \
6276 struct va_format vaf; \
6279 va_start(args, fmt); \
6284 r = __netdev_printk(level, dev, &vaf); \
6289 EXPORT_SYMBOL(func);
6291 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6292 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6293 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6294 define_netdev_printk_level(netdev_err, KERN_ERR);
6295 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6296 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6297 define_netdev_printk_level(netdev_info, KERN_INFO);
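/*
 * A minimal sketch (hypothetical driver): the level-specific helpers generated
 * above prefix messages with the device name, so callers can simply do:
 */
static void example_report_link(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}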
6299 static void __net_exit netdev_exit(struct net *net)
6301 kfree(net->dev_name_head);
6302 kfree(net->dev_index_head);
6305 static struct pernet_operations __net_initdata netdev_net_ops = {
6306 .init = netdev_init,
6307 .exit = netdev_exit,
6310 static void __net_exit default_device_exit(struct net *net)
6312 struct net_device *dev, *aux;
6314 * Push all migratable network devices back to the
6315 * initial network namespace
6318 for_each_netdev_safe(net, dev, aux) {
6320 char fb_name[IFNAMSIZ];
6322 /* Ignore unmovable devices (e.g. loopback) */
6323 if (dev->features & NETIF_F_NETNS_LOCAL)
6326 /* Leave virtual devices for the generic cleanup */
6327 if (dev->rtnl_link_ops)
6330 /* Push remaining network devices to init_net */
6331 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6332 err = dev_change_net_namespace(dev, &init_net, fb_name);
6334 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6335 __func__, dev->name, err);
6342 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6344 /* At exit all network devices must be removed from a network
6345 * namespace. Do this in the reverse order of registration.
6346 * Do this across as many network namespaces as possible to
6347 * improve batching efficiency.
6349 struct net_device *dev;
6351 LIST_HEAD(dev_kill_list);
6354 list_for_each_entry(net, net_list, exit_list) {
6355 for_each_netdev_reverse(net, dev) {
6356 if (dev->rtnl_link_ops)
6357 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6359 unregister_netdevice_queue(dev, &dev_kill_list);
6362 unregister_netdevice_many(&dev_kill_list);
6366 static struct pernet_operations __net_initdata default_device_ops = {
6367 .exit = default_device_exit,
6368 .exit_batch = default_device_exit_batch,
6372 * Initialize the DEV module. At boot time this walks the device list and
6373 * unhooks any devices that fail to initialise (normally hardware not
6374 * present) and leaves us with a valid list of present and active devices.
6379 * This is called single threaded during boot, so no need
6380 * to take the rtnl semaphore.
6382 static int __init net_dev_init(void)
6384 int i, rc = -ENOMEM;
6386 BUG_ON(!dev_boot_phase);
6388 if (dev_proc_init())
6391 if (netdev_kobject_init())
6394 INIT_LIST_HEAD(&ptype_all);
6395 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6396 INIT_LIST_HEAD(&ptype_base[i]);
6398 if (register_pernet_subsys(&netdev_net_ops))
6402 * Initialise the packet receive queues.
6405 for_each_possible_cpu(i) {
6406 struct softnet_data *sd = &per_cpu(softnet_data, i);
6408 memset(sd, 0, sizeof(*sd));
6409 skb_queue_head_init(&sd->input_pkt_queue);
6410 skb_queue_head_init(&sd->process_queue);
6411 sd->completion_queue = NULL;
6412 INIT_LIST_HEAD(&sd->poll_list);
6413 sd->output_queue = NULL;
6414 sd->output_queue_tailp = &sd->output_queue;
6416 sd->csd.func = rps_trigger_softirq;
6422 sd->backlog.poll = process_backlog;
6423 sd->backlog.weight = weight_p;
6424 sd->backlog.gro_list = NULL;
6425 sd->backlog.gro_count = 0;
6430 /* The loopback device is special: if any other network device
6431 * is present in a network namespace, the loopback device must
6432 * be present too. Since we now dynamically allocate and free the
6433 * loopback device, this invariant is maintained by
6434 * keeping the loopback device as the first device on the
6435 * list of network devices: it is the first device
6436 * to appear and the last network device to disappear.
6439 if (register_pernet_device(&loopback_net_ops))
6442 if (register_pernet_device(&default_device_ops))
6445 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6446 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6448 hotcpu_notifier(dev_cpu_callback, 0);
6456 subsys_initcall(net_dev_init);
6458 static int __init initialize_hashrnd(void)
6460 get_random_bytes(&hashrnd, sizeof(hashrnd));
6464 late_initcall_sync(initialize_hashrnd);