/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device becomes available
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/if_bridge.h>
#include <linux/if_macvlan.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <linux/pci.h>

#include "net-sysfs.h"
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *	       sure which should go first, but I bet it won't make much
 *	       difference if we are running VLANs.  The good news is that
 *	       this protocol won't be in the list unless compiled in, so
 *	       the average user (w/out VLANs) will not be adversely affected.
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif
/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	   be the first on list, it is not able to sense, that packet
 *	   is cloned and should be copied-on-write, so that it will
 *	   change it and subsequent readers will get broken packet.
 */

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	int hash;

	spin_lock_bh(&ptype_lock);
	if (pt->type == htons(ETH_P_ALL))
		list_add_rcu(&pt->list, &ptype_all);
	else {
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
		list_add_rcu(&pt->list, &ptype_base[hash]);
	}
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head;
	struct packet_type *pt1;

	spin_lock_bh(&ptype_lock);

	if (pt->type == htons(ETH_P_ALL))
		head = &ptype_all;
	else
		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
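/*
 * A minimal sketch, not from the original file, of how a protocol module
 * typically uses dev_add_pack()/dev_remove_pack(). The handler name
 * my_rcv and the 0x88B5 (IEEE local experimental) ethertype are
 * illustrative only:
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		// we own one reference to skb
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type	= cpu_to_be16(0x88B5),
 *		.func	= my_rcv,	// .dev left NULL: match all devices
 *	};
 *
 *	dev_add_pack(&my_ptype);	// e.g. from module_init
 *	...
 *	dev_remove_pack(&my_ptype);	// from module_exit; may sleep
 */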
/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);

/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
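/*
 * A sketch, not from the original file, of the two calling styles above.
 * The refcounted flavour hands back a reference the caller must drop:
 *
 *	struct net_device *dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		...
 *		dev_put(dev);	// release the reference we were given
 *	}
 *
 * whereas the RCU flavour returns an unreferenced pointer that is only
 * valid inside the read-side critical section:
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	... use dev, but do not cache the pointer ...
 *	rcu_read_unlock();
 */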
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);

/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
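/*
 * Sketch of a refcount-free ifindex lookup using the RCU variant above
 * (illustrative, not from the original file):
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(net, ifindex);
 *	if (dev)
 *		mtu = dev->mtu;		// safe: dev cannot be freed here
 *	rcu_read_unlock();
 *
 * Use dev_get_by_index() instead whenever the pointer must outlive the
 * RCU read-side critical section.
 */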
/**
 *	dev_getbyhwaddr - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device. The caller must hold the
 *	rtnl semaphore. The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking.
 *
 *	BUGS:
 *	If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
				    unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags);
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}
EXPORT_SYMBOL(dev_valid_name);
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
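/*
 * Sketch of the usual caller pattern (names illustrative, not from the
 * original file): a driver that wants the next free "foo%d" slot before
 * registration does
 *
 *	err = dev_alloc_name(dev, "foo%d");	// dev->name becomes "foo0", ...
 *	if (err < 0)
 *		goto fail;
 *	err = register_netdevice(dev);
 *
 * The non-negative return value is the unit number actually assigned.
 */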
static int dev_get_valid_name(struct net *net, const char *name, char *buf,
			      bool fmt)
{
	if (!dev_valid_name(name))
		return -EINVAL;

	if (fmt && strchr(name, '%'))
		return __dev_alloc_name(net, name, buf);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (buf != name)
		strlcpy(buf, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d".
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, newname, dev->name, 1);
	if (err < 0)
		return err;

rollback:
	/* For now only devices in the initial network namespace
	 * are in sysfs.
	 */
	if (net_eq(net, &init_net)) {
		ret = device_rename(&dev->dev, dev->name);
		if (ret) {
			memcpy(dev->name, oldname, IFNAMSIZ);
			return ret;
		}
	}

	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}
/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!dev->ifalias)
		return -ENOMEM;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}

/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);

/**
 *	dev_load	- load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */
void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	if (!dev && capable(CAP_NET_ADMIN))
		request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);
static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	/* Is it even present? */
	if (!netif_device_present(dev))
		return -ENODEV;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	/* Call device private open method */
	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/* If it went open OK then: */
	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;		/* Set the flags */
		net_dmaengine_get();		/* Enable NET_DMA */
		dev_set_rx_mode(dev);		/* Initialize multicasting status */
		dev_activate(dev);		/* Wakeup transmit queue engine */
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	/* Is it already up? */
	if (dev->flags & IFF_UP)
		return 0;

	/* Open device */
	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	/* ... and announce new interface. */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

static int __dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	ASSERT_RTNL();
	might_sleep();

	/* Tell people we are going down, so that they can
	 * prepare for it while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/* Call the device specific close. This cannot fail.
	 * Only if device is UP.
	 *
	 * We allow it to be called even after a DETACH hot-plug
	 * event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	dev->flags &= ~IFF_UP;	/* Device is now down. */
	net_dmaengine_put();	/* Shutdown NET_DMA */

	return 0;
}

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (!(dev->flags & IFF_UP))
		return 0;

	__dev_close(dev);

	/* Tell people we are down */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}
EXPORT_SYMBOL(dev_close);
/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
	    dev->ethtool_ops->set_flags) {
		u32 flags = dev->ethtool_ops->get_flags(dev);
		if (flags & ETH_FLAG_LRO) {
			flags &= ~ETH_FLAG_LRO;
			dev->ethtool_ops->set_flags(dev, flags);
		}
	}
	WARN_ON(dev->features & NETIF_F_LRO);
}
EXPORT_SYMBOL(dev_disable_lro);

static int dev_boot_phase = 1;

/*
 *	Device change register/unregister. These are not inline or static
 *	as we export them to the world.
 */
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			nb->notifier_call(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				break;

			if (dev->flags & IFF_UP) {
				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
				nb->notifier_call(nb, NETDEV_DOWN, dev);
			}
			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
		}
	}

	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
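/*
 * Minimal sketch of a netdevice notifier client (handler name and
 * boilerplate are illustrative, not from the original file). In this
 * kernel the void pointer passed to the callback is the net_device:
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *			break;
 *		case NETDEV_UNREGISTER:
 *			// release any per-device state we keep
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);	// replays REGISTER/UP events
 */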
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	ASSERT_RTNL();
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}

/* When > 0 there are consumers of rx skb time stamps */
static atomic_t netstamp_needed = ATOMIC_INIT(0);

void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	if (atomic_read(&netstamp_needed))
		__net_timestamp(skb);
	else
		skb->tstamp.tv64 = 0;
}

static inline void net_timestamp_check(struct sk_buff *skb)
{
	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
		__net_timestamp(skb);
}
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	skb_orphan(skb);

	if (!(dev->flags & IFF_UP) ||
	    (skb->len > (dev->mtu + dev->hard_header_len))) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	skb_set_dev(skb, dev);
	skb->tstamp.tv64 = 0;
	skb->pkt_type = PACKET_HOST;
	skb->protocol = eth_type_trans(skb, dev);
	return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
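/*
 * Sketch of dev_forward_skb() in a pair device's transmit path (names
 * illustrative, not from the original file): the skb leaving one end is
 * injected into the peer's receive queue, as veth-like drivers do:
 *
 *	static netdev_tx_t pair_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = pair_peer(dev);	// hypothetical helper
 *
 *		dev_forward_skb(peer, skb);	// consumes skb either way
 *		return NETDEV_TX_OK;
 *	}
 */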
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;

#ifdef CONFIG_NET_CLS_ACT
	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
		net_timestamp_set(skb);
#else
	net_timestamp_set(skb);
#endif

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			/* skb->nh should be correctly
			 * set by sender, so that the second statement is
			 * just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       skb2->protocol, dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			ptype->func(skb2, skb->dev, ptype, skb->dev);
		}
	}
	rcu_read_unlock();
}

static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

void dev_kfree_skb_irq(struct sk_buff *skb)
{
	if (atomic_dec_and_test(&skb->users)) {
		struct softnet_data *sd;
		unsigned long flags;

		local_irq_save(flags);
		sd = &__get_cpu_var(softnet_data);
		skb->next = sd->completion_queue;
		sd->completion_queue = skb;
		raise_softirq_irqoff(NET_TX_SOFTIRQ);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(dev_kfree_skb_irq);

void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);

/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached from system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);

static bool can_checksum_protocol(unsigned long features, __be16 protocol)
{
	return ((features & NETIF_F_GEN_CSUM) ||
		((features & NETIF_F_IP_CSUM) &&
		 protocol == htons(ETH_P_IP)) ||
		((features & NETIF_F_IPV6_CSUM) &&
		 protocol == htons(ETH_P_IPV6)) ||
		((features & NETIF_F_FCOE_CRC) &&
		 protocol == htons(ETH_P_FCOE)));
}

static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
{
	if (can_checksum_protocol(dev->features, skb->protocol))
		return true;

	if (skb->protocol == htons(ETH_P_8021Q)) {
		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
		if (can_checksum_protocol(dev->features & dev->vlan_features,
					  veh->h_vlan_encapsulated_proto))
			return true;
	}

	return false;
}
/**
 * skb_dev_set -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	skb_dst_drop(skb);
	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}
	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		/* Let GSO fix up the checksum. */
		goto out_set_summed;
	}

	offset = skb->csum_start - skb_headroom(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);

/**
 *	skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 */
struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_type *ptype;
	__be16 type = skb->protocol;
	int err;

	skb_reset_mac_header(skb);
	skb->mac_len = skb->network_header - skb->mac_header;
	__skb_pull(skb, skb->mac_len);

	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
		struct net_device *dev = skb->dev;
		struct ethtool_drvinfo info = {};

		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
			dev->ethtool_ops->get_drvinfo(dev, &info);

		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
			"ip_summed=%d\n",
		     info.driver, dev ? dev->features : 0L,
		     skb->sk ? skb->sk->sk_route_caps : 0L,
		     skb->len, skb->data_len, skb->ip_summed);

		if (skb_header_cloned(skb) &&
		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
			return ERR_PTR(err);
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
				err = ptype->gso_send_check(skb);
				segs = ERR_PTR(err);
				if (err || skb_gso_ok(skb, features))
					break;
				__skb_push(skb, (skb->data -
						 skb_network_header(skb)));
			}
			segs = ptype->gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_gso_segment);

/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		printk(KERN_ERR "%s: hw csum failure.\n",
			dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif

/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
				return 1;
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)

static void dev_gso_skb_destructor(struct sk_buff *skb)
{
	struct dev_gso_cb *cb;

	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;
		kfree_skb(nskb);
	} while (skb->next);

	cb = DEV_GSO_CB(skb);
	if (cb->destructor)
		cb->destructor(skb);
}

/**
 *	dev_gso_segment - Perform emulated hardware segmentation on skb.
 *	@skb: buffer to segment
 *
 *	This function segments the given skb and stores the list of segments
 *	in skb->next.
 */
static int dev_gso_segment(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct sk_buff *segs;
	int features = dev->features & ~(illegal_highdma(dev, skb) ?
					 NETIF_F_SG : 0);

	segs = skb_gso_segment(skb, features);

	/* Verifying header integrity only. */
	if (!segs)
		return 0;
	if (IS_ERR(segs))
		return PTR_ERR(segs);

	skb->next = segs;
	DEV_GSO_CB(skb)->destructor = skb->destructor;
	skb->destructor = dev_gso_skb_destructor;

	return 0;
}

/*
 * Try to orphan skb early, right before transmission by the device.
 * We cannot orphan skb if tx timestamp is requested, since
 * drivers need to call skb_tstamp_tx() to send the timestamp.
 */
static inline void skb_orphan_try(struct sk_buff *skb)
{
	if (!skb_tx(skb)->flags)
		skb_orphan(skb);
}

int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc = NETDEV_TX_OK;

	if (likely(!skb->next)) {
		if (!list_empty(&ptype_all))
			dev_queue_xmit_nit(skb, dev);

		/*
		 * If device doesn't need skb->dst, release it right now while
		 * it's hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(skb);

		skb_orphan_try(skb);

		if (netif_needs_gso(dev, skb)) {
			if (unlikely(dev_gso_segment(skb)))
				goto out_kfree_skb;
			if (skb->next)
				goto gso;
		}

		rc = ops->ndo_start_xmit(skb, dev);
		if (rc == NETDEV_TX_OK)
			txq_trans_update(txq);
		return rc;
	}

gso:
	do {
		struct sk_buff *nskb = skb->next;

		skb->next = nskb->next;
		nskb->next = NULL;

		/*
		 * If device doesn't need nskb->dst, release it right now while
		 * it's hot in this cpu cache
		 */
		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
			skb_dst_drop(nskb);

		rc = ops->ndo_start_xmit(nskb, dev);
		if (unlikely(rc != NETDEV_TX_OK)) {
			if (rc & ~NETDEV_TX_MASK)
				goto out_kfree_gso_skb;
			nskb->next = skb->next;
			skb->next = nskb;
			return rc;
		}
		txq_trans_update(txq);
		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
			return NETDEV_TX_BUSY;
	} while (skb->next);

out_kfree_gso_skb:
	if (likely(skb->next == NULL))
		skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
	kfree_skb(skb);
	return rc;
}
static u32 hashrnd __read_mostly;

u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
{
	u32 hash;

	if (skb_rx_queue_recorded(skb)) {
		hash = skb_get_rx_queue(skb);
		while (unlikely(hash >= dev->real_num_tx_queues))
			hash -= dev->real_num_tx_queues;
		return hash;
	}

	if (skb->sk && skb->sk->sk_hash)
		hash = skb->sk->sk_hash;
	else
		hash = (__force u16) skb->protocol;

	hash = jhash_1word(hash, hashrnd);

	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
}
EXPORT_SYMBOL(skb_tx_hash);
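/*
 * The final line maps the 32-bit jhash value onto [0, real_num_tx_queues)
 * without a modulo: (hash * n) >> 32 scales the hash linearly into the
 * queue range. For example, with n = 4 queues, hash = 0x80000000 gives
 * (0x80000000 * 4) >> 32 = 2.
 */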
static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
{
	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
		if (net_ratelimit()) {
			pr_warning("%s selects TX queue %d, but "
				   "real number of TX queues is %d\n",
				   dev->name, queue_index,
				   dev->real_num_tx_queues);
		}
		return 0;
	}
	return queue_index;
}

static struct netdev_queue *dev_pick_tx(struct net_device *dev,
					struct sk_buff *skb)
{
	u16 queue_index;
	struct sock *sk = skb->sk;

	if (sk_tx_queue_recorded(sk)) {
		queue_index = sk_tx_queue_get(sk);
	} else {
		const struct net_device_ops *ops = dev->netdev_ops;

		if (ops->ndo_select_queue) {
			queue_index = ops->ndo_select_queue(dev, skb);
			queue_index = dev_cap_txqueue(dev, queue_index);
		} else {
			queue_index = 0;
			if (dev->real_num_tx_queues > 1)
				queue_index = skb_tx_hash(dev, skb);

			if (sk) {
				struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);

				if (dst && skb_dst(skb) == dst)
					sk_tx_queue_set(sk, queue_index);
			}
		}
	}

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}

static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	int rc;

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */
		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
			skb_dst_force(skb);
		__qdisc_update_bstats(q, skb->len);
		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
			__qdisc_run(q);
		else
			clear_bit(__QDISC_STATE_RUNNING, &q->state);

		rc = NET_XMIT_SUCCESS;
	} else {
		skb_dst_force(skb);
		rc = qdisc_enqueue_root(skb, q);
		qdisc_run(q);
	}
	spin_unlock(root_lock);

	return rc;
}

/*
 * Returns true if either:
 *	1. skb has frag_list and the device doesn't support FRAGLIST, or
 *	2. skb is fragmented and the device does not support SG, or if
 *	   at least one of fragments is in highmem and device does not
 *	   support DMA from it.
 */
static inline int skb_needs_linearize(struct sk_buff *skb,
				      struct net_device *dev)
{
	return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
	       (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
					      illegal_highdma(dev, skb)));
}
/**
 *	dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	/* GSO will handle the following emulations directly. */
	if (netif_needs_gso(dev, skb))
		goto gso;

	/* Convert a paged skb to linear, if required */
	if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
		goto out_kfree_skb;

	/* If packet is not checksummed and device does not support
	 * checksumming for this protocol, complete checksumming here.
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		skb_set_transport_header(skb, skb->csum_start -
					      skb_headroom(skb));
		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
			goto out_kfree_skb;
	}

gso:
	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	txq = dev_pick_tx(dev, skb);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_tx_queue_stopped(txq)) {
				rc = dev_hard_start_xmit(skb, dev, txq);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			if (net_ratelimit())
				printk(KERN_CRIT "Virtual device %s asks to "
				       "queue packet!\n", dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
			if (net_ratelimit())
				printk(KERN_CRIT "Dead loop on virtual device "
				       "%s, fix it urgently!\n", dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

out_kfree_skb:
	kfree_skb(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
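/*
 * Sketch of the calling convention (illustrative, not from the original
 * file): a protocol that has built an skb sets the device and hands it
 * off, treating any nonzero return as "not transmitted" but never
 * touching the skb again:
 *
 *	skb->dev = dev;
 *	rc = dev_queue_xmit(skb);	// consumes skb, even on error
 *	if (rc)
 *		tx_dropped++;		// hypothetical caller-side counter
 */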
/*=======================================================================
			Receiver routines
  =======================================================================*/

int netdev_max_backlog __read_mostly = 1000;
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;		/* old backlog weight */

/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2227 /* One global table that all flow-based protocols share. */
2228 struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
2229 EXPORT_SYMBOL(rps_sock_flow_table);
2232 * get_rps_cpu is called from netif_receive_skb and returns the target
2233 * CPU from the RPS map of the receiving queue for a given skb.
2234 * rcu_read_lock must be held on entry.
2236 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2237 struct rps_dev_flow **rflowp)
2239 struct ipv6hdr *ip6;
2241 struct netdev_rx_queue *rxqueue;
2242 struct rps_map *map;
2243 struct rps_dev_flow_table *flow_table;
2244 struct rps_sock_flow_table *sock_flow_table;
2248 u32 addr1, addr2, ihl;
2254 if (skb_rx_queue_recorded(skb)) {
2255 u16 index = skb_get_rx_queue(skb);
2256 if (unlikely(index >= dev->num_rx_queues)) {
2257 if (net_ratelimit()) {
2258 pr_warning("%s received packet on queue "
2259 "%u, but number of RX queues is %u\n",
2260 dev->name, index, dev->num_rx_queues);
2264 rxqueue = dev->_rx + index;
2268 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
2272 goto got_hash; /* Skip hash computation on packet header */
2274 switch (skb->protocol) {
2275 case __constant_htons(ETH_P_IP):
2276 if (!pskb_may_pull(skb, sizeof(*ip)))
2279 ip = (struct iphdr *) skb->data;
2280 ip_proto = ip->protocol;
2281 addr1 = (__force u32) ip->saddr;
2282 addr2 = (__force u32) ip->daddr;
2285 case __constant_htons(ETH_P_IPV6):
2286 if (!pskb_may_pull(skb, sizeof(*ip6)))
2289 ip6 = (struct ipv6hdr *) skb->data;
2290 ip_proto = ip6->nexthdr;
2291 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2292 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2305 case IPPROTO_UDPLITE:
2306 if (pskb_may_pull(skb, (ihl * 4) + 4)) {
2307 ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
2308 if (ports.v16[1] < ports.v16[0])
2309 swap(ports.v16[0], ports.v16[1]);
2317 /* get a consistent hash (same value on both flow directions) */
2320 skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2325 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2326 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2327 if (flow_table && sock_flow_table) {
2329 struct rps_dev_flow *rflow;
2331 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2334 next_cpu = sock_flow_table->ents[skb->rxhash &
2335 sock_flow_table->mask];
2338 * If the desired CPU (where last recvmsg was done) is
2339 * different from current CPU (one in the rx-queue flow
2340 * table entry), switch if one of the following holds:
2341 * - Current CPU is unset (equal to RPS_NO_CPU).
2342 * - Current CPU is offline.
2343 * - The current CPU's queue tail has advanced beyond the
2344 * last packet that was enqueued using this table entry.
2345 * This guarantees that all previous packets for the flow
2346 * have been dequeued, thus preserving in order delivery.
2348 if (unlikely(tcpu != next_cpu) &&
2349 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2350 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2351 rflow->last_qtail)) >= 0)) {
2352 tcpu = rflow->cpu = next_cpu;
2353 if (tcpu != RPS_NO_CPU)
2354 rflow->last_qtail = per_cpu(softnet_data,
2355 tcpu).input_queue_head;
2357 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2364 map = rcu_dereference(rxqueue->rps_map);
2366 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2368 if (cpu_online(tcpu)) {
2378 /* Called from hardirq (IPI) context */
2379 static void rps_trigger_softirq(void *data)
2381 struct softnet_data *sd = data;
2383 ____napi_schedule(sd, &sd->backlog);
2387 #endif /* CONFIG_RPS */
/*
 * Check whether this softnet_data structure belongs to another CPU.
 * If so, queue it on our IPI list and return 1.
 */
2394 static int rps_ipi_queued(struct softnet_data *sd)
2397 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2400 sd->rps_ipi_next = mysd->rps_ipi_list;
2401 mysd->rps_ipi_list = sd;
2403 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2406 #endif /* CONFIG_RPS */
/*
 * enqueue_to_backlog is called to queue an skb on a per-CPU backlog
 * queue (which may be a remote CPU's queue).
 */
2414 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2415 unsigned int *qtail)
2417 struct softnet_data *sd;
2418 unsigned long flags;
2420 sd = &per_cpu(softnet_data, cpu);
2422 local_irq_save(flags);
2425 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2426 if (skb_queue_len(&sd->input_pkt_queue)) {
2428 __skb_queue_tail(&sd->input_pkt_queue, skb);
2430 *qtail = sd->input_queue_head +
2431 skb_queue_len(&sd->input_pkt_queue);
2434 local_irq_restore(flags);
2435 return NET_RX_SUCCESS;
	/* Schedule NAPI for the backlog device.
	 * We can use a non-atomic operation since we own the queue lock.
	 */
2441 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2442 if (!rps_ipi_queued(sd))
2443 ____napi_schedule(sd, &sd->backlog);
2451 local_irq_restore(flags);
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process. It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped)
 */
2472 int netif_rx(struct sk_buff *skb)
2476 /* if netpoll wants it, pretend we never saw it */
2477 if (netpoll_rx(skb))
2480 if (netdev_tstamp_prequeue)
2481 net_timestamp_check(skb);
2485 struct rps_dev_flow voidflow, *rflow = &voidflow;
2490 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2492 cpu = smp_processor_id();
2494 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2501 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2507 EXPORT_SYMBOL(netif_rx);
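/* Usage sketch (hypothetical non-NAPI driver, illustrative only, not
 * part of this file): the canonical caller is an interrupt handler
 * that has just pulled a frame off the hardware. eth_type_trans()
 * must run first so skb->protocol and skb->pkt_type are set for the
 * protocol layers; the skb must not be touched after netif_rx():
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	dev->stats.rx_packets++;
 *	dev->stats.rx_bytes += skb->len;
 *	netif_rx(skb);
 */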
2509 int netif_rx_ni(struct sk_buff *skb)
2514 err = netif_rx(skb);
2515 if (local_softirq_pending())
2521 EXPORT_SYMBOL(netif_rx_ni);
2523 static void net_tx_action(struct softirq_action *h)
2525 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2527 if (sd->completion_queue) {
2528 struct sk_buff *clist;
2530 local_irq_disable();
2531 clist = sd->completion_queue;
2532 sd->completion_queue = NULL;
2536 struct sk_buff *skb = clist;
2537 clist = clist->next;
2539 WARN_ON(atomic_read(&skb->users));
2544 if (sd->output_queue) {
2547 local_irq_disable();
2548 head = sd->output_queue;
2549 sd->output_queue = NULL;
2550 sd->output_queue_tailp = &sd->output_queue;
2554 struct Qdisc *q = head;
2555 spinlock_t *root_lock;
2557 head = head->next_sched;
2559 root_lock = qdisc_lock(q);
2560 if (spin_trylock(root_lock)) {
2561 smp_mb__before_clear_bit();
2562 clear_bit(__QDISC_STATE_SCHED,
2565 spin_unlock(root_lock);
2567 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2569 __netif_reschedule(q);
2571 smp_mb__before_clear_bit();
2572 clear_bit(__QDISC_STATE_SCHED,
2580 static inline int deliver_skb(struct sk_buff *skb,
2581 struct packet_type *pt_prev,
2582 struct net_device *orig_dev)
2584 atomic_inc(&skb->users);
2585 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2588 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2590 #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
2591 /* This hook is defined here for ATM LANE */
2592 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2593 unsigned char *addr) __read_mostly;
2594 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
/*
 * If the bridge module is loaded, call the bridging hook.
 * Returns NULL if the packet was consumed.
 */
2601 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2602 struct sk_buff *skb) __read_mostly;
2603 EXPORT_SYMBOL_GPL(br_handle_frame_hook);
2605 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2606 struct packet_type **pt_prev, int *ret,
2607 struct net_device *orig_dev)
2609 struct net_bridge_port *port;
2611 if (skb->pkt_type == PACKET_LOOPBACK ||
2612 (port = rcu_dereference(skb->dev->br_port)) == NULL)
2616 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2620 return br_handle_frame_hook(port, skb);
2623 #define handle_bridge(skb, pt_prev, ret, orig_dev) (skb)
2626 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2627 struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p,
2628 struct sk_buff *skb) __read_mostly;
2629 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2631 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2632 struct packet_type **pt_prev,
2634 struct net_device *orig_dev)
2636 struct macvlan_port *port;
2638 port = rcu_dereference(skb->dev->macvlan_port);
2643 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2646 return macvlan_handle_frame_hook(port, skb);
2649 #define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb)
2652 #ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for a useless compare
 * and two extra stores when ingress is not configured but
 * CONFIG_NET_CLS_ACT is.
 * NOTE: This doesn't stop any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 */
2661 static int ing_filter(struct sk_buff *skb)
2663 struct net_device *dev = skb->dev;
2664 u32 ttl = G_TC_RTTL(skb->tc_verd);
2665 struct netdev_queue *rxq;
2666 int result = TC_ACT_OK;
2669 if (MAX_RED_LOOP < ttl++) {
2671 "Redir loop detected Dropping packet (%d->%d)\n",
2672 skb->skb_iif, dev->ifindex);
2676 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2677 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2679 rxq = &dev->rx_queue;
2682 if (q != &noop_qdisc) {
2683 spin_lock(qdisc_lock(q));
2684 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2685 result = qdisc_enqueue_root(skb, q);
2686 spin_unlock(qdisc_lock(q));
2692 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2693 struct packet_type **pt_prev,
2694 int *ret, struct net_device *orig_dev)
2696 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2700 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2703 /* Huh? Why does turning on AF_PACKET affect this? */
2704 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2707 switch (ing_filter(skb)) {
/**
 *	netif_nit_deliver - deliver received packets to network taps
 *	@skb: buffer
 *
 *	This function is used to deliver incoming packets to network
 *	taps. It should be used when the normal netif_receive_skb path
 *	is bypassed, for example because of VLAN acceleration.
 */
2728 void netif_nit_deliver(struct sk_buff *skb)
2730 struct packet_type *ptype;
2732 if (list_empty(&ptype_all))
2735 skb_reset_network_header(skb);
2736 skb_reset_transport_header(skb);
2737 skb->mac_len = skb->network_header - skb->mac_header;
2740 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2741 if (!ptype->dev || ptype->dev == skb->dev)
2742 deliver_skb(skb, ptype, skb->dev);
2747 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2748 struct net_device *master)
2750 if (skb->pkt_type == PACKET_HOST) {
2751 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2753 memcpy(dest, master->dev_addr, ETH_ALEN);
2757 /* On bonding slaves other than the currently active slave, suppress
2758 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
 * ARP on active-backup slaves with arp_validate enabled.
 */
2761 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2763 struct net_device *dev = skb->dev;
2765 if (master->priv_flags & IFF_MASTER_ARPMON)
2766 dev->last_rx = jiffies;
2768 if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
		/* Do address unmangling. The local destination address
		 * will always be the one the master has. This provides
		 * the right functionality in a bridge.
		 */
2773 skb_bond_set_mac_by_master(skb, master);
2776 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2777 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2778 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2781 if (master->priv_flags & IFF_MASTER_ALB) {
2782 if (skb->pkt_type != PACKET_BROADCAST &&
2783 skb->pkt_type != PACKET_MULTICAST)
2786 if (master->priv_flags & IFF_MASTER_8023AD &&
2787 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2794 EXPORT_SYMBOL(__skb_bond_should_drop);
2796 static int __netif_receive_skb(struct sk_buff *skb)
2798 struct packet_type *ptype, *pt_prev;
2799 struct net_device *orig_dev;
2800 struct net_device *master;
2801 struct net_device *null_or_orig;
2802 struct net_device *null_or_bond;
2803 int ret = NET_RX_DROP;
2806 if (!netdev_tstamp_prequeue)
2807 net_timestamp_check(skb);
2809 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2810 return NET_RX_SUCCESS;
2812 /* if we've gotten here through NAPI, check netpoll */
2813 if (netpoll_receive_skb(skb))
2817 skb->skb_iif = skb->dev->ifindex;
2819 null_or_orig = NULL;
2820 orig_dev = skb->dev;
2821 master = ACCESS_ONCE(orig_dev->master);
2823 if (skb_bond_should_drop(skb, master))
2824 null_or_orig = orig_dev; /* deliver only exact match */
2829 __get_cpu_var(softnet_data).processed++;
2831 skb_reset_network_header(skb);
2832 skb_reset_transport_header(skb);
2833 skb->mac_len = skb->network_header - skb->mac_header;
2839 #ifdef CONFIG_NET_CLS_ACT
2840 if (skb->tc_verd & TC_NCLS) {
2841 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2846 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2847 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2848 ptype->dev == orig_dev) {
2850 ret = deliver_skb(skb, pt_prev, orig_dev);
2855 #ifdef CONFIG_NET_CLS_ACT
2856 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2862 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2865 skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2870 * Make sure frames received on VLAN interfaces stacked on
2871 * bonding interfaces still make their way to any base bonding
2872 * device that may have registered for a specific ptype. The
2873 * handler may have to adjust skb->dev and orig_dev.
2875 null_or_bond = NULL;
2876 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2877 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2878 null_or_bond = vlan_dev_real_dev(skb->dev);
2881 type = skb->protocol;
2882 list_for_each_entry_rcu(ptype,
2883 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2884 if (ptype->type == type && (ptype->dev == null_or_orig ||
2885 ptype->dev == skb->dev || ptype->dev == orig_dev ||
2886 ptype->dev == null_or_bond)) {
2888 ret = deliver_skb(skb, pt_prev, orig_dev);
2894 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	/* Jamal, now you will not be able to escape explaining
	 * to me how you were going to use this. :-)
	 */
2909 * netif_receive_skb - process receive buffer from network
2910 * @skb: buffer to process
2912 * netif_receive_skb() is the main receive data processing function.
2913 * It always succeeds. The buffer may be dropped during processing
2914 * for congestion control or by the protocol layers.
2916 * This function may only be called from softirq context and interrupts
2917 * should be enabled.
2919 * Return values (usually ignored):
2920 * NET_RX_SUCCESS: no congestion
2921 * NET_RX_DROP: packet was dropped
2923 int netif_receive_skb(struct sk_buff *skb)
2925 if (netdev_tstamp_prequeue)
2926 net_timestamp_check(skb);
2930 struct rps_dev_flow voidflow, *rflow = &voidflow;
2935 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2938 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2942 ret = __netif_receive_skb(skb);
2948 return __netif_receive_skb(skb);
2951 EXPORT_SYMBOL(netif_receive_skb);
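/* Usage sketch (hypothetical NAPI driver): a ->poll() routine feeds
 * completed frames straight to the stack, staying within its budget.
 * example_rx_ring_pop() is a made-up stand-in for the driver's
 * descriptor-ring handling:
 *
 *	while (work < budget &&
 *	       (skb = example_rx_ring_pop(adapter)) != NULL) {
 *		skb->protocol = eth_type_trans(skb, adapter->netdev);
 *		netif_receive_skb(skb);
 *		work++;
 *	}
 */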
2953 /* Network device is going away, flush any packets still pending
2954 * Called with irqs disabled.
2956 static void flush_backlog(void *arg)
2958 struct net_device *dev = arg;
2959 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2960 struct sk_buff *skb, *tmp;
2963 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
2964 if (skb->dev == dev) {
2965 __skb_unlink(skb, &sd->input_pkt_queue);
2967 input_queue_head_add(sd, 1);
2972 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
2973 if (skb->dev == dev) {
2974 __skb_unlink(skb, &sd->process_queue);
2980 static int napi_gro_complete(struct sk_buff *skb)
2982 struct packet_type *ptype;
2983 __be16 type = skb->protocol;
2984 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2987 if (NAPI_GRO_CB(skb)->count == 1) {
2988 skb_shinfo(skb)->gso_size = 0;
2993 list_for_each_entry_rcu(ptype, head, list) {
2994 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2997 err = ptype->gro_complete(skb);
3003 WARN_ON(&ptype->list == head);
3005 return NET_RX_SUCCESS;
3009 return netif_receive_skb(skb);
3012 static void napi_gro_flush(struct napi_struct *napi)
3014 struct sk_buff *skb, *next;
3016 for (skb = napi->gro_list; skb; skb = next) {
3019 napi_gro_complete(skb);
3022 napi->gro_count = 0;
3023 napi->gro_list = NULL;
3026 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3028 struct sk_buff **pp = NULL;
3029 struct packet_type *ptype;
3030 __be16 type = skb->protocol;
3031 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3034 enum gro_result ret;
3036 if (!(skb->dev->features & NETIF_F_GRO))
3039 if (skb_is_gso(skb) || skb_has_frags(skb))
3043 list_for_each_entry_rcu(ptype, head, list) {
3044 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3047 skb_set_network_header(skb, skb_gro_offset(skb));
3048 mac_len = skb->network_header - skb->mac_header;
3049 skb->mac_len = mac_len;
3050 NAPI_GRO_CB(skb)->same_flow = 0;
3051 NAPI_GRO_CB(skb)->flush = 0;
3052 NAPI_GRO_CB(skb)->free = 0;
3054 pp = ptype->gro_receive(&napi->gro_list, skb);
3059 if (&ptype->list == head)
3062 same_flow = NAPI_GRO_CB(skb)->same_flow;
3063 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3066 struct sk_buff *nskb = *pp;
3070 napi_gro_complete(nskb);
3077 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3081 NAPI_GRO_CB(skb)->count = 1;
3082 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3083 skb->next = napi->gro_list;
3084 napi->gro_list = skb;
3088 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3089 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3091 BUG_ON(skb->end - skb->tail < grow);
3093 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3096 skb->data_len -= grow;
3098 skb_shinfo(skb)->frags[0].page_offset += grow;
3099 skb_shinfo(skb)->frags[0].size -= grow;
3101 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3102 put_page(skb_shinfo(skb)->frags[0].page);
3103 memmove(skb_shinfo(skb)->frags,
3104 skb_shinfo(skb)->frags + 1,
3105 --skb_shinfo(skb)->nr_frags);
3116 EXPORT_SYMBOL(dev_gro_receive);
3119 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3123 if (netpoll_rx_on(skb))
3126 for (p = napi->gro_list; p; p = p->next) {
3127 NAPI_GRO_CB(p)->same_flow =
3128 (p->dev == skb->dev) &&
3129 !compare_ether_header(skb_mac_header(p),
3130 skb_gro_mac_header(skb));
3131 NAPI_GRO_CB(p)->flush = 0;
3134 return dev_gro_receive(napi, skb);
3137 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3141 if (netif_receive_skb(skb))
3146 case GRO_MERGED_FREE:
3157 EXPORT_SYMBOL(napi_skb_finish);
3159 void skb_gro_reset_offset(struct sk_buff *skb)
3161 NAPI_GRO_CB(skb)->data_offset = 0;
3162 NAPI_GRO_CB(skb)->frag0 = NULL;
3163 NAPI_GRO_CB(skb)->frag0_len = 0;
3165 if (skb->mac_header == skb->tail &&
3166 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3167 NAPI_GRO_CB(skb)->frag0 =
3168 page_address(skb_shinfo(skb)->frags[0].page) +
3169 skb_shinfo(skb)->frags[0].page_offset;
3170 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3173 EXPORT_SYMBOL(skb_gro_reset_offset);
3175 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3177 skb_gro_reset_offset(skb);
3179 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3181 EXPORT_SYMBOL(napi_gro_receive);
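/* Usage sketch: a GRO-aware driver calls napi_gro_receive() from its
 * poll routine instead of netif_receive_skb(), so that same-flow TCP
 * segments can be coalesced before they hit the protocol layers
 * ("adapter" is a hypothetical driver-private structure):
 *
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	napi_gro_receive(&adapter->napi, skb);
 */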
3183 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3185 __skb_pull(skb, skb_headlen(skb));
3186 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3190 EXPORT_SYMBOL(napi_reuse_skb);
3192 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3194 struct sk_buff *skb = napi->skb;
3197 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3203 EXPORT_SYMBOL(napi_get_frags);
3205 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3211 skb->protocol = eth_type_trans(skb, skb->dev);
3213 if (ret == GRO_HELD)
3214 skb_gro_pull(skb, -ETH_HLEN);
3215 else if (netif_receive_skb(skb))
3220 case GRO_MERGED_FREE:
3221 napi_reuse_skb(napi, skb);
3230 EXPORT_SYMBOL(napi_frags_finish);
3232 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3234 struct sk_buff *skb = napi->skb;
3241 skb_reset_mac_header(skb);
3242 skb_gro_reset_offset(skb);
3244 off = skb_gro_offset(skb);
3245 hlen = off + sizeof(*eth);
3246 eth = skb_gro_header_fast(skb, off);
3247 if (skb_gro_header_hard(skb, hlen)) {
3248 eth = skb_gro_header_slow(skb, hlen, off);
3249 if (unlikely(!eth)) {
3250 napi_reuse_skb(napi, skb);
3256 skb_gro_pull(skb, sizeof(*eth));
3259 * This works because the only protocols we care about don't require
3260 * special handling. We'll fix it up properly at the end.
3262 skb->protocol = eth->h_proto;
3267 EXPORT_SYMBOL(napi_frags_skb);
3269 gro_result_t napi_gro_frags(struct napi_struct *napi)
3271 struct sk_buff *skb = napi_frags_skb(napi);
3276 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3278 EXPORT_SYMBOL(napi_gro_frags);
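/* Usage sketch for the frag-based GRO entry points (hypothetical
 * page-based RX driver): instead of building a linear skb itself, the
 * driver borrows one from the NAPI context, attaches its page, and
 * lets napi_gro_frags() pull the Ethernet header and derive
 * skb->protocol as napi_frags_skb() above does:
 *
 *	skb = napi_get_frags(&adapter->napi);
 *	if (!skb)
 *		return;			// out of memory: drop
 *	skb_fill_page_desc(skb, 0, page, offset, len);
 *	skb->len += len;
 *	skb->data_len += len;
 *	skb->truesize += len;
 *	napi_gro_frags(&adapter->napi);
 */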
/*
 * net_rps_action sends any pending IPIs for RPS.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
3284 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3287 struct softnet_data *remsd = sd->rps_ipi_list;
3290 sd->rps_ipi_list = NULL;
		/* Send pending IPIs to kick RPS processing on remote CPUs. */
3296 struct softnet_data *next = remsd->rps_ipi_next;
3298 if (cpu_online(remsd->cpu))
3299 __smp_call_function_single(remsd->cpu,
3308 static int process_backlog(struct napi_struct *napi, int quota)
3311 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
	/* Check if we have pending IPIs; it is better to send them now
	 * than to wait until net_rx_action() ends.
	 */
3317 if (sd->rps_ipi_list) {
3318 local_irq_disable();
3319 net_rps_action_and_irq_enable(sd);
3322 napi->weight = weight_p;
3323 local_irq_disable();
3324 while (work < quota) {
3325 struct sk_buff *skb;
3328 while ((skb = __skb_dequeue(&sd->process_queue))) {
3330 __netif_receive_skb(skb);
3331 if (++work >= quota)
3333 local_irq_disable();
3337 qlen = skb_queue_len(&sd->input_pkt_queue);
3339 input_queue_head_add(sd, qlen);
3340 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3341 &sd->process_queue);
3343 if (qlen < quota - work) {
			/*
			 * Inline a custom version of __napi_complete().
			 * Only the current CPU owns and manipulates this
			 * NAPI instance, and NAPI_STATE_SCHED is the only
			 * possible flag set on the backlog, so we can use
			 * a plain write instead of clear_bit() and we do
			 * not need an smp_mb() memory barrier.
			 */
3351 list_del(&napi->poll_list);
3354 quota = work + qlen;
3364 * __napi_schedule - schedule for receive
3365 * @n: entry to schedule
3367 * The entry's receive function will be scheduled to run
3369 void __napi_schedule(struct napi_struct *n)
3371 unsigned long flags;
3373 local_irq_save(flags);
3374 ____napi_schedule(&__get_cpu_var(softnet_data), n);
3375 local_irq_restore(flags);
3377 EXPORT_SYMBOL(__napi_schedule);
3379 void __napi_complete(struct napi_struct *n)
3381 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3382 BUG_ON(n->gro_list);
3384 list_del(&n->poll_list);
3385 smp_mb__before_clear_bit();
3386 clear_bit(NAPI_STATE_SCHED, &n->state);
3388 EXPORT_SYMBOL(__napi_complete);
3390 void napi_complete(struct napi_struct *n)
3392 unsigned long flags;
	/*
	 * Don't let NAPI dequeue from the CPU poll list
	 * just in case it is running on a different CPU.
	 */
3398 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3402 local_irq_save(flags);
3404 local_irq_restore(flags);
3406 EXPORT_SYMBOL(napi_complete);
3408 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3409 int (*poll)(struct napi_struct *, int), int weight)
3411 INIT_LIST_HEAD(&napi->poll_list);
3412 napi->gro_count = 0;
3413 napi->gro_list = NULL;
3416 napi->weight = weight;
3417 list_add(&napi->dev_list, &dev->napi_list);
3419 #ifdef CONFIG_NETPOLL
3420 spin_lock_init(&napi->poll_lock);
3421 napi->poll_owner = -1;
3423 set_bit(NAPI_STATE_SCHED, &napi->state);
3425 EXPORT_SYMBOL(netif_napi_add);
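/* Usage sketch (hypothetical driver): a NAPI context is registered
 * once at probe time and completed from the poll callback when the
 * ring is drained. example_poll(), example_clean_rx() and
 * example_enable_irq() are made-up names; 64 is the conventional
 * default weight:
 *
 *	netif_napi_add(netdev, &adapter->napi, example_poll, 64);
 *
 *	static int example_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct example_adapter *adapter =
 *			container_of(napi, struct example_adapter, napi);
 *		int work = example_clean_rx(adapter, budget);
 *
 *		if (work < budget) {
 *			napi_complete(napi);
 *			example_enable_irq(adapter);
 *		}
 *		return work;
 *	}
 */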
3427 void netif_napi_del(struct napi_struct *napi)
3429 struct sk_buff *skb, *next;
3431 list_del_init(&napi->dev_list);
3432 napi_free_frags(napi);
3434 for (skb = napi->gro_list; skb; skb = next) {
3440 napi->gro_list = NULL;
3441 napi->gro_count = 0;
3443 EXPORT_SYMBOL(netif_napi_del);
3445 static void net_rx_action(struct softirq_action *h)
3447 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3448 unsigned long time_limit = jiffies + 2;
3449 int budget = netdev_budget;
3452 local_irq_disable();
3454 while (!list_empty(&sd->poll_list)) {
3455 struct napi_struct *n;
		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which allows
		 * an average latency of 1.5/HZ.
		 */
3462 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3467 /* Even though interrupts have been re-enabled, this
3468 * access is safe because interrupts can only add new
3469 * entries to the tail of this list, and only ->poll()
3470 * calls can remove this head entry from the list.
3472 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3474 have = netpoll_poll_lock(n);
3478 /* This NAPI_STATE_SCHED test is for avoiding a race
3479 * with netpoll's poll_napi(). Only the entity which
3480 * obtains the lock and sees NAPI_STATE_SCHED set will
3481 * actually make the ->poll() call. Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
3485 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3486 work = n->poll(n, weight);
3490 WARN_ON_ONCE(work > weight);
3494 local_irq_disable();
3496 /* Drivers must not modify the NAPI state if they
3497 * consume the entire weight. In such cases this code
3498 * still "owns" the NAPI instance and therefore can
3499 * move the instance around on the list at-will.
3501 if (unlikely(work == weight)) {
3502 if (unlikely(napi_disable_pending(n))) {
3505 local_irq_disable();
3507 list_move_tail(&n->poll_list, &sd->poll_list);
3510 netpoll_poll_unlock(have);
3513 net_rps_action_and_irq_enable(sd);
3515 #ifdef CONFIG_NET_DMA
3517 * There may not be any more sk_buffs coming right now, so push
3518 * any pending DMA copies to hardware
3520 dma_issue_pending_all();
3527 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3531 static gifconf_func_t *gifconf_list[NPROTO];
3534 * register_gifconf - register a SIOCGIF handler
3535 * @family: Address family
3536 * @gifconf: Function handler
3538 * Register protocol dependent address dumping routines. The handler
3539 * that is passed must not be freed or reused until it has been replaced
3540 * by another handler.
3542 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3544 if (family >= NPROTO)
3546 gifconf_list[family] = gifconf;
3549 EXPORT_SYMBOL(register_gifconf);
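/* Usage sketch (assumption about the caller, not part of this file):
 * address families register their dump helper once at boot; IPv4, for
 * instance, does the equivalent of
 *
 *	register_gifconf(PF_INET, inet_gifconf);
 *
 * where inet_gifconf() fills one struct ifreq per address and returns
 * the number of bytes written (or the required length when the buffer
 * is NULL, matching the probing call in dev_ifconf() below).
 */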
3553 * Map an interface index to its name (SIOCGIFNAME)
3557 * We need this ioctl for efficient implementation of the
3558 * if_indextoname() function required by the IPv6 API. Without
 *	it, we would have to search all the interfaces to find a
 *	match.
 */
3563 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3565 struct net_device *dev;
3569 * Fetch the caller's info block.
3572 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3576 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3582 strcpy(ifr.ifr_name, dev->name);
3585 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3591 * Perform a SIOCGIFCONF call. This structure will change
3592 * size eventually, and there is nothing I can do about it.
3593 * Thus we will need a 'compatibility mode'.
3596 static int dev_ifconf(struct net *net, char __user *arg)
3599 struct net_device *dev;
3606 * Fetch the caller's info block.
3609 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3616 * Loop over the interfaces, and write an info block for each.
3620 for_each_netdev(net, dev) {
3621 for (i = 0; i < NPROTO; i++) {
3622 if (gifconf_list[i]) {
3625 done = gifconf_list[i](dev, NULL, 0);
3627 done = gifconf_list[i](dev, pos + total,
3637 * All done. Write the updated control block back to the caller.
3639 ifc.ifc_len = total;
3642 * Both BSD and Solaris return 0 here, so we do too.
3644 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3647 #ifdef CONFIG_PROC_FS
/*
 *	This is invoked by the /proc filesystem handler to display a device
 *	in detail.
 */
3652 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3655 struct net *net = seq_file_net(seq);
3657 struct net_device *dev;
3661 return SEQ_START_TOKEN;
3664 for_each_netdev_rcu(net, dev)
3671 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3673 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3674 first_net_device(seq_file_net(seq)) :
3675 next_net_device((struct net_device *)v);
3678 return rcu_dereference(dev);
3681 void dev_seq_stop(struct seq_file *seq, void *v)
3687 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3689 const struct net_device_stats *stats = dev_get_stats(dev);
3691 seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3692 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3693 dev->name, stats->rx_bytes, stats->rx_packets,
3695 stats->rx_dropped + stats->rx_missed_errors,
3696 stats->rx_fifo_errors,
3697 stats->rx_length_errors + stats->rx_over_errors +
3698 stats->rx_crc_errors + stats->rx_frame_errors,
3699 stats->rx_compressed, stats->multicast,
3700 stats->tx_bytes, stats->tx_packets,
3701 stats->tx_errors, stats->tx_dropped,
3702 stats->tx_fifo_errors, stats->collisions,
3703 stats->tx_carrier_errors +
3704 stats->tx_aborted_errors +
3705 stats->tx_window_errors +
3706 stats->tx_heartbeat_errors,
3707 stats->tx_compressed);
/*
 *	Called from the procfs module. This now uses the new arbitrary-sized
 *	/proc/net interface to create /proc/net/dev.
 */
3714 static int dev_seq_show(struct seq_file *seq, void *v)
3716 if (v == SEQ_START_TOKEN)
3717 seq_puts(seq, "Inter-| Receive "
3719 " face |bytes packets errs drop fifo frame "
3720 "compressed multicast|bytes packets errs "
3721 "drop fifo colls carrier compressed\n");
3723 dev_seq_printf_stats(seq, v);
3727 static struct softnet_data *softnet_get_online(loff_t *pos)
3729 struct softnet_data *sd = NULL;
3731 while (*pos < nr_cpu_ids)
3732 if (cpu_online(*pos)) {
3733 sd = &per_cpu(softnet_data, *pos);
3740 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3742 return softnet_get_online(pos);
3745 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3748 return softnet_get_online(pos);
3751 static void softnet_seq_stop(struct seq_file *seq, void *v)
3755 static int softnet_seq_show(struct seq_file *seq, void *v)
3757 struct softnet_data *sd = v;
3759 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3760 sd->processed, sd->dropped, sd->time_squeeze, 0,
3761 0, 0, 0, 0, /* was fastroute */
3762 sd->cpu_collision, sd->received_rps);
3766 static const struct seq_operations dev_seq_ops = {
3767 .start = dev_seq_start,
3768 .next = dev_seq_next,
3769 .stop = dev_seq_stop,
3770 .show = dev_seq_show,
3773 static int dev_seq_open(struct inode *inode, struct file *file)
3775 return seq_open_net(inode, file, &dev_seq_ops,
3776 sizeof(struct seq_net_private));
3779 static const struct file_operations dev_seq_fops = {
3780 .owner = THIS_MODULE,
3781 .open = dev_seq_open,
3783 .llseek = seq_lseek,
3784 .release = seq_release_net,
3787 static const struct seq_operations softnet_seq_ops = {
3788 .start = softnet_seq_start,
3789 .next = softnet_seq_next,
3790 .stop = softnet_seq_stop,
3791 .show = softnet_seq_show,
3794 static int softnet_seq_open(struct inode *inode, struct file *file)
3796 return seq_open(file, &softnet_seq_ops);
3799 static const struct file_operations softnet_seq_fops = {
3800 .owner = THIS_MODULE,
3801 .open = softnet_seq_open,
3803 .llseek = seq_lseek,
3804 .release = seq_release,
3807 static void *ptype_get_idx(loff_t pos)
3809 struct packet_type *pt = NULL;
3813 list_for_each_entry_rcu(pt, &ptype_all, list) {
3819 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3820 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3829 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3833 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3836 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3838 struct packet_type *pt;
3839 struct list_head *nxt;
3843 if (v == SEQ_START_TOKEN)
3844 return ptype_get_idx(0);
3847 nxt = pt->list.next;
3848 if (pt->type == htons(ETH_P_ALL)) {
3849 if (nxt != &ptype_all)
3852 nxt = ptype_base[0].next;
3854 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3856 while (nxt == &ptype_base[hash]) {
3857 if (++hash >= PTYPE_HASH_SIZE)
3859 nxt = ptype_base[hash].next;
3862 return list_entry(nxt, struct packet_type, list);
3865 static void ptype_seq_stop(struct seq_file *seq, void *v)
3871 static int ptype_seq_show(struct seq_file *seq, void *v)
3873 struct packet_type *pt = v;
3875 if (v == SEQ_START_TOKEN)
3876 seq_puts(seq, "Type Device Function\n");
3877 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3878 if (pt->type == htons(ETH_P_ALL))
3879 seq_puts(seq, "ALL ");
3881 seq_printf(seq, "%04x", ntohs(pt->type));
3883 seq_printf(seq, " %-8s %pF\n",
3884 pt->dev ? pt->dev->name : "", pt->func);
3890 static const struct seq_operations ptype_seq_ops = {
3891 .start = ptype_seq_start,
3892 .next = ptype_seq_next,
3893 .stop = ptype_seq_stop,
3894 .show = ptype_seq_show,
3897 static int ptype_seq_open(struct inode *inode, struct file *file)
3899 return seq_open_net(inode, file, &ptype_seq_ops,
3900 sizeof(struct seq_net_private));
3903 static const struct file_operations ptype_seq_fops = {
3904 .owner = THIS_MODULE,
3905 .open = ptype_seq_open,
3907 .llseek = seq_lseek,
3908 .release = seq_release_net,
3912 static int __net_init dev_proc_net_init(struct net *net)
3916 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3918 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3920 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3923 if (wext_proc_init(net))
3929 proc_net_remove(net, "ptype");
3931 proc_net_remove(net, "softnet_stat");
3933 proc_net_remove(net, "dev");
3937 static void __net_exit dev_proc_net_exit(struct net *net)
3939 wext_proc_exit(net);
3941 proc_net_remove(net, "ptype");
3942 proc_net_remove(net, "softnet_stat");
3943 proc_net_remove(net, "dev");
3946 static struct pernet_operations __net_initdata dev_proc_ops = {
3947 .init = dev_proc_net_init,
3948 .exit = dev_proc_net_exit,
3951 static int __init dev_proc_init(void)
3953 return register_pernet_subsys(&dev_proc_ops);
3956 #define dev_proc_init() 0
3957 #endif /* CONFIG_PROC_FS */
3961 * netdev_set_master - set up master/slave pair
3962 * @slave: slave device
3963 * @master: new master device
3965 * Changes the master device of the slave. Pass %NULL to break the
3966 * bonding. The caller must hold the RTNL semaphore. On a failure
3967 * a negative errno code is returned. On success the reference counts
3968 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3969 * function returns zero.
3971 int netdev_set_master(struct net_device *slave, struct net_device *master)
3973 struct net_device *old = slave->master;
3983 slave->master = master;
3990 slave->flags |= IFF_SLAVE;
3992 slave->flags &= ~IFF_SLAVE;
3994 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3997 EXPORT_SYMBOL(netdev_set_master);
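/* Usage sketch: the bonding driver pairs enslave and release with
 * matched calls, under RTNL:
 *
 *	err = netdev_set_master(slave_dev, bond_dev);	// enslave
 *	...
 *	netdev_set_master(slave_dev, NULL);		// break the bond
 */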
3999 static void dev_change_rx_flags(struct net_device *dev, int flags)
4001 const struct net_device_ops *ops = dev->netdev_ops;
4003 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4004 ops->ndo_change_rx_flags(dev, flags);
4007 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4009 unsigned short old_flags = dev->flags;
4015 dev->flags |= IFF_PROMISC;
4016 dev->promiscuity += inc;
4017 if (dev->promiscuity == 0) {
4020 * If inc causes overflow, untouch promisc and return error.
4023 dev->flags &= ~IFF_PROMISC;
4025 dev->promiscuity -= inc;
				printk(KERN_WARNING "%s: promiscuity counter overflowed, "
					"set promiscuity failed; the promiscuity feature "
					"of the device might be broken.\n", dev->name);
4032 if (dev->flags != old_flags) {
4033 printk(KERN_INFO "device %s %s promiscuous mode\n",
4034 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4036 if (audit_enabled) {
4037 current_uid_gid(&uid, &gid);
4038 audit_log(current->audit_context, GFP_ATOMIC,
4039 AUDIT_ANOM_PROMISCUOUS,
4040 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4041 dev->name, (dev->flags & IFF_PROMISC),
4042 (old_flags & IFF_PROMISC),
4043 audit_get_loginuid(current),
4045 audit_get_sessionid(current));
4048 dev_change_rx_flags(dev, IFF_PROMISC);
4054 * dev_set_promiscuity - update promiscuity count on a device
4058 * Add or remove promiscuity from a device. While the count in the device
4059 * remains above zero the interface remains promiscuous. Once it hits zero
 *	the device reverts to normal filtering operation. A negative @inc
4061 * value is used to drop promiscuity on the device.
4062 * Return 0 if successful or a negative errno code on error.
4064 int dev_set_promiscuity(struct net_device *dev, int inc)
4066 unsigned short old_flags = dev->flags;
4069 err = __dev_set_promiscuity(dev, inc);
4072 if (dev->flags != old_flags)
4073 dev_set_rx_mode(dev);
4076 EXPORT_SYMBOL(dev_set_promiscuity);
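/* Usage sketch: a packet-capture style user brackets its lifetime with
 * matched increments and decrements (RTNL held), so several
 * independent users can share promiscuous mode safely:
 *
 *	dev_set_promiscuity(dev, 1);	// start capturing
 *	...
 *	dev_set_promiscuity(dev, -1);	// done; drop our reference
 */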
4079 * dev_set_allmulti - update allmulti count on a device
4083 * Add or remove reception of all multicast frames to a device. While the
 *	count in the device remains above zero the interface remains listening
 *	to all multicast frames. Once it hits zero the device reverts to normal
4086 * filtering operation. A negative @inc value is used to drop the counter
4087 * when releasing a resource needing all multicasts.
4088 * Return 0 if successful or a negative errno code on error.
4091 int dev_set_allmulti(struct net_device *dev, int inc)
4093 unsigned short old_flags = dev->flags;
4097 dev->flags |= IFF_ALLMULTI;
4098 dev->allmulti += inc;
4099 if (dev->allmulti == 0) {
4102 * If inc causes overflow, untouch allmulti and return error.
4105 dev->flags &= ~IFF_ALLMULTI;
4107 dev->allmulti -= inc;
				printk(KERN_WARNING "%s: allmulti counter overflowed, "
					"set allmulti failed; the allmulti feature of "
					"the device might be broken.\n", dev->name);
4114 if (dev->flags ^ old_flags) {
4115 dev_change_rx_flags(dev, IFF_ALLMULTI);
4116 dev_set_rx_mode(dev);
4120 EXPORT_SYMBOL(dev_set_allmulti);
/*
 *	Upload unicast and multicast address lists to the device and
 *	configure RX filtering. When the device doesn't support unicast
 *	filtering it is put in promiscuous mode while unicast addresses
 *	are present.
 */
4128 void __dev_set_rx_mode(struct net_device *dev)
4130 const struct net_device_ops *ops = dev->netdev_ops;
4132 /* dev_open will call this function so the list will stay sane. */
4133 if (!(dev->flags&IFF_UP))
4136 if (!netif_device_present(dev))
4139 if (ops->ndo_set_rx_mode)
4140 ops->ndo_set_rx_mode(dev);
	/* Unicast address changes may only happen under the rtnl,
4143 * therefore calling __dev_set_promiscuity here is safe.
4145 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4146 __dev_set_promiscuity(dev, 1);
4147 dev->uc_promisc = 1;
4148 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4149 __dev_set_promiscuity(dev, -1);
4150 dev->uc_promisc = 0;
4153 if (ops->ndo_set_multicast_list)
4154 ops->ndo_set_multicast_list(dev);
4158 void dev_set_rx_mode(struct net_device *dev)
4160 netif_addr_lock_bh(dev);
4161 __dev_set_rx_mode(dev);
4162 netif_addr_unlock_bh(dev);
4166 * dev_get_flags - get flags reported to userspace
4169 * Get the combination of flag bits exported through APIs to userspace.
4171 unsigned dev_get_flags(const struct net_device *dev)
4175 flags = (dev->flags & ~(IFF_PROMISC |
4180 (dev->gflags & (IFF_PROMISC |
4183 if (netif_running(dev)) {
4184 if (netif_oper_up(dev))
4185 flags |= IFF_RUNNING;
4186 if (netif_carrier_ok(dev))
4187 flags |= IFF_LOWER_UP;
4188 if (netif_dormant(dev))
4189 flags |= IFF_DORMANT;
4194 EXPORT_SYMBOL(dev_get_flags);
4196 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4198 int old_flags = dev->flags;
4204 * Set the flags on our device.
4207 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4208 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4210 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
	/*
	 *	Load in the correct multicast list now that the flags have changed.
	 */
4217 if ((old_flags ^ flags) & IFF_MULTICAST)
4218 dev_change_rx_flags(dev, IFF_MULTICAST);
4220 dev_set_rx_mode(dev);
	/*
	 *	Have we downed the interface? We handle IFF_UP ourselves
	 *	according to user attempts to set it, rather than blindly
	 *	setting it.
	 */
4229 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4230 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4233 dev_set_rx_mode(dev);
4236 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4237 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4239 dev->gflags ^= IFF_PROMISC;
4240 dev_set_promiscuity(dev, inc);
	/* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	   is important. Some (broken) drivers set IFF_PROMISC when
	   IFF_ALLMULTI is requested, without asking us and without reporting it.
	 */
4247 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4248 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4250 dev->gflags ^= IFF_ALLMULTI;
4251 dev_set_allmulti(dev, inc);
4257 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4259 unsigned int changes = dev->flags ^ old_flags;
4261 if (changes & IFF_UP) {
4262 if (dev->flags & IFF_UP)
4263 call_netdevice_notifiers(NETDEV_UP, dev);
4265 call_netdevice_notifiers(NETDEV_DOWN, dev);
4268 if (dev->flags & IFF_UP &&
4269 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4270 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4274 * dev_change_flags - change device settings
4276 * @flags: device state flags
 *	Change settings on a device based on state flags. The flags are
 *	in the userspace-exported format.
4281 int dev_change_flags(struct net_device *dev, unsigned flags)
4284 int old_flags = dev->flags;
4286 ret = __dev_change_flags(dev, flags);
4290 changes = old_flags ^ dev->flags;
4292 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4294 __dev_notify_flags(dev, old_flags);
4297 EXPORT_SYMBOL(dev_change_flags);
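/* Usage sketch: bringing an interface up from kernel code (with RTNL
 * held) is just a flags update; the core handles the open, rx-mode
 * reload and notifications:
 *
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 */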
4300 * dev_set_mtu - Change maximum transfer unit
4302 * @new_mtu: new transfer unit
4304 * Change the maximum transfer size of the network device.
4306 int dev_set_mtu(struct net_device *dev, int new_mtu)
4308 const struct net_device_ops *ops = dev->netdev_ops;
4311 if (new_mtu == dev->mtu)
4314 /* MTU must be positive. */
4318 if (!netif_device_present(dev))
4322 if (ops->ndo_change_mtu)
4323 err = ops->ndo_change_mtu(dev, new_mtu);
4327 if (!err && dev->flags & IFF_UP)
4328 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4331 EXPORT_SYMBOL(dev_set_mtu);
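/* Usage sketch (RTNL held): attempt jumbo frames and check the result,
 * since the driver's ndo_change_mtu() may reject the value:
 *
 *	err = dev_set_mtu(dev, 9000);
 *	if (err)
 *		return err;	// device or driver refused the new MTU
 */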
4334 * dev_set_mac_address - Change Media Access Control Address
4338 * Change the hardware (MAC) address of the device
4340 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4342 const struct net_device_ops *ops = dev->netdev_ops;
4345 if (!ops->ndo_set_mac_address)
4347 if (sa->sa_family != dev->type)
4349 if (!netif_device_present(dev))
4351 err = ops->ndo_set_mac_address(dev, sa);
4353 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4356 EXPORT_SYMBOL(dev_set_mac_address);
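/* Usage sketch: the new address travels in a struct sockaddr whose
 * sa_family must match dev->type (e.g. ARPHRD_ETHER), as checked
 * above. new_mac is a hypothetical u8[ETH_ALEN] buffer:
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *	err = dev_set_mac_address(dev, &sa);	// RTNL held
 */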
4359 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4361 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4364 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4370 case SIOCGIFFLAGS: /* Get interface flags */
4371 ifr->ifr_flags = (short) dev_get_flags(dev);
4374 case SIOCGIFMETRIC: /* Get the metric on the interface
4375 (currently unused) */
4376 ifr->ifr_metric = 0;
4379 case SIOCGIFMTU: /* Get the MTU of a device */
4380 ifr->ifr_mtu = dev->mtu;
4385 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4387 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4388 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4389 ifr->ifr_hwaddr.sa_family = dev->type;
4397 ifr->ifr_map.mem_start = dev->mem_start;
4398 ifr->ifr_map.mem_end = dev->mem_end;
4399 ifr->ifr_map.base_addr = dev->base_addr;
4400 ifr->ifr_map.irq = dev->irq;
4401 ifr->ifr_map.dma = dev->dma;
4402 ifr->ifr_map.port = dev->if_port;
4406 ifr->ifr_ifindex = dev->ifindex;
4410 ifr->ifr_qlen = dev->tx_queue_len;
		/* dev_ioctl() should ensure this case is never reached. */
4426 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4428 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4431 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4432 const struct net_device_ops *ops;
4437 ops = dev->netdev_ops;
4440 case SIOCSIFFLAGS: /* Set interface flags */
4441 return dev_change_flags(dev, ifr->ifr_flags);
4443 case SIOCSIFMETRIC: /* Set the metric on the interface
4444 (currently unused) */
4447 case SIOCSIFMTU: /* Set the MTU of a device */
4448 return dev_set_mtu(dev, ifr->ifr_mtu);
4451 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4453 case SIOCSIFHWBROADCAST:
4454 if (ifr->ifr_hwaddr.sa_family != dev->type)
4456 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4457 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4458 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4462 if (ops->ndo_set_config) {
4463 if (!netif_device_present(dev))
4465 return ops->ndo_set_config(dev, &ifr->ifr_map);
4470 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4471 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4473 if (!netif_device_present(dev))
4475 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4478 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4479 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4481 if (!netif_device_present(dev))
4483 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4486 if (ifr->ifr_qlen < 0)
4488 dev->tx_queue_len = ifr->ifr_qlen;
4492 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4493 return dev_change_name(dev, ifr->ifr_newname);
4496 * Unknown or private ioctl
4499 if ((cmd >= SIOCDEVPRIVATE &&
4500 cmd <= SIOCDEVPRIVATE + 15) ||
4501 cmd == SIOCBONDENSLAVE ||
4502 cmd == SIOCBONDRELEASE ||
4503 cmd == SIOCBONDSETHWADDR ||
4504 cmd == SIOCBONDSLAVEINFOQUERY ||
4505 cmd == SIOCBONDINFOQUERY ||
4506 cmd == SIOCBONDCHANGEACTIVE ||
4507 cmd == SIOCGMIIPHY ||
4508 cmd == SIOCGMIIREG ||
4509 cmd == SIOCSMIIREG ||
4510 cmd == SIOCBRADDIF ||
4511 cmd == SIOCBRDELIF ||
4512 cmd == SIOCSHWTSTAMP ||
4513 cmd == SIOCWANDEV) {
4515 if (ops->ndo_do_ioctl) {
4516 if (netif_device_present(dev))
4517 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4529 * This function handles all "interface"-type I/O control requests. The actual
4530 * 'doing' part of this is dev_ifsioc above.
4534 * dev_ioctl - network device ioctl
4535 * @net: the applicable net namespace
4536 * @cmd: command to issue
4537 * @arg: pointer to a struct ifreq in user space
4539 * Issue ioctl functions to devices. This is normally called by the
4540 * user space syscall interfaces but can sometimes be useful for
4541 * other purposes. The return value is the return from the syscall if
4542 * positive or a negative errno code on error.
4545 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
	/* One special case: SIOCGIFCONF takes an ifconf argument
	   and requires shared lock, because it sleeps writing
	   to user space.
	 */
4556 if (cmd == SIOCGIFCONF) {
4558 ret = dev_ifconf(net, (char __user *) arg);
4562 if (cmd == SIOCGIFNAME)
4563 return dev_ifname(net, (struct ifreq __user *)arg);
4565 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4568 ifr.ifr_name[IFNAMSIZ-1] = 0;
4570 colon = strchr(ifr.ifr_name, ':');
4575 * See which interface the caller is talking about.
4580 * These ioctl calls:
4581 * - can be done by all.
 *	- are atomic and do not require locking.
4593 dev_load(net, ifr.ifr_name);
4595 ret = dev_ifsioc_locked(net, &ifr, cmd);
4600 if (copy_to_user(arg, &ifr,
4601 sizeof(struct ifreq)))
4607 dev_load(net, ifr.ifr_name);
4609 ret = dev_ethtool(net, &ifr);
4614 if (copy_to_user(arg, &ifr,
4615 sizeof(struct ifreq)))
4621 * These ioctl calls:
4622 * - require superuser power.
4623 * - require strict serialization.
4629 if (!capable(CAP_NET_ADMIN))
4631 dev_load(net, ifr.ifr_name);
4633 ret = dev_ifsioc(net, &ifr, cmd);
4638 if (copy_to_user(arg, &ifr,
4639 sizeof(struct ifreq)))
4645 * These ioctl calls:
4646 * - require superuser power.
4647 * - require strict serialization.
4648 * - do not return a value
4658 case SIOCSIFHWBROADCAST:
4661 case SIOCBONDENSLAVE:
4662 case SIOCBONDRELEASE:
4663 case SIOCBONDSETHWADDR:
4664 case SIOCBONDCHANGEACTIVE:
4668 if (!capable(CAP_NET_ADMIN))
4671 case SIOCBONDSLAVEINFOQUERY:
4672 case SIOCBONDINFOQUERY:
4673 dev_load(net, ifr.ifr_name);
4675 ret = dev_ifsioc(net, &ifr, cmd);
4680 /* Get the per device memory space. We can add this but
4681 * currently do not support it */
4683 /* Set the per device memory buffer space.
4684 * Not applicable in our case */
4689 * Unknown or private ioctl.
4692 if (cmd == SIOCWANDEV ||
4693 (cmd >= SIOCDEVPRIVATE &&
4694 cmd <= SIOCDEVPRIVATE + 15)) {
4695 dev_load(net, ifr.ifr_name);
4697 ret = dev_ifsioc(net, &ifr, cmd);
4699 if (!ret && copy_to_user(arg, &ifr,
4700 sizeof(struct ifreq)))
4704 /* Take care of Wireless Extensions */
4705 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4706 return wext_handle_ioctl(net, &ifr, cmd, arg);
4713 * dev_new_index - allocate an ifindex
4714 * @net: the applicable net namespace
4716 * Returns a suitable unique value for a new device interface
4717 * number. The caller must hold the rtnl semaphore or the
4718 * dev_base_lock to be sure it remains unique.
4720 static int dev_new_index(struct net *net)
4726 if (!__dev_get_by_index(net, ifindex))
4731 /* Delayed registration/unregisteration */
4732 static LIST_HEAD(net_todo_list);
4734 static void net_set_todo(struct net_device *dev)
4736 list_add_tail(&dev->todo_list, &net_todo_list);
4739 static void rollback_registered_many(struct list_head *head)
4741 struct net_device *dev, *tmp;
4743 BUG_ON(dev_boot_phase);
4746 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4747 /* Some devices call without registering
4748 * for initialization unwind. Remove those
4749 * devices and proceed with the remaining.
4751 if (dev->reg_state == NETREG_UNINITIALIZED) {
4752 pr_debug("unregister_netdevice: device %s/%p never "
4753 "was registered\n", dev->name, dev);
4756 list_del(&dev->unreg_list);
4760 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4762 /* If device is running, close it first. */
4765 /* And unlink it from device chain. */
4766 unlist_netdevice(dev);
4768 dev->reg_state = NETREG_UNREGISTERING;
4773 list_for_each_entry(dev, head, unreg_list) {
4774 /* Shutdown queueing discipline. */
		/* Notify protocols that we are about to destroy
		   this device. They should clean up all their state.
		 */
4781 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4783 if (!dev->rtnl_link_ops ||
4784 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4785 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4788 * Flush the unicast and multicast chains
4793 if (dev->netdev_ops->ndo_uninit)
4794 dev->netdev_ops->ndo_uninit(dev);
4796 /* Notifier chain MUST detach us from master device. */
4797 WARN_ON(dev->master);
4799 /* Remove entries from kobject tree */
4800 netdev_unregister_kobject(dev);
4803 /* Process any work delayed until the end of the batch */
4804 dev = list_first_entry(head, struct net_device, unreg_list);
4805 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4809 list_for_each_entry(dev, head, unreg_list)
4813 static void rollback_registered(struct net_device *dev)
4817 list_add(&dev->unreg_list, &single);
4818 rollback_registered_many(&single);
4821 static void __netdev_init_queue_locks_one(struct net_device *dev,
4822 struct netdev_queue *dev_queue,
4825 spin_lock_init(&dev_queue->_xmit_lock);
4826 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4827 dev_queue->xmit_lock_owner = -1;
4830 static void netdev_init_queue_locks(struct net_device *dev)
4832 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4833 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4836 unsigned long netdev_fix_features(unsigned long features, const char *name)
4838 /* Fix illegal SG+CSUM combinations. */
4839 if ((features & NETIF_F_SG) &&
4840 !(features & NETIF_F_ALL_CSUM)) {
4842 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4843 "checksum feature.\n", name);
4844 features &= ~NETIF_F_SG;
4847 /* TSO requires that SG is present as well. */
4848 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4850 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4851 "SG feature.\n", name);
4852 features &= ~NETIF_F_TSO;
4855 if (features & NETIF_F_UFO) {
4856 if (!(features & NETIF_F_GEN_CSUM)) {
4858 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4859 "since no NETIF_F_HW_CSUM feature.\n",
4861 features &= ~NETIF_F_UFO;
4864 if (!(features & NETIF_F_SG)) {
4866 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4867 "since no NETIF_F_SG feature.\n", name);
4868 features &= ~NETIF_F_UFO;
4874 EXPORT_SYMBOL(netdev_fix_features);
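/* Usage sketch (assumption about a typical caller): stacking drivers
 * sanitize the feature word they compute from their slaves before
 * publishing it; bonding, for example, does the equivalent of
 *
 *	features = netdev_fix_features(features, bond_dev->name);
 *	bond_dev->features = features;
 */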
4877 * netif_stacked_transfer_operstate - transfer operstate
4878 * @rootdev: the root or lower level device to transfer state from
4879 * @dev: the device to transfer operstate to
4881 * Transfer operational state from root to device. This is normally
4882 * called when a stacking relationship exists between the root
 *	device and the device (a leaf device).
4885 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4886 struct net_device *dev)
4888 if (rootdev->operstate == IF_OPER_DORMANT)
4889 netif_dormant_on(dev);
4891 netif_dormant_off(dev);
4893 if (netif_carrier_ok(rootdev)) {
4894 if (!netif_carrier_ok(dev))
4895 netif_carrier_on(dev);
4897 if (netif_carrier_ok(dev))
4898 netif_carrier_off(dev);
4901 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
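/* Usage sketch: VLAN and macvlan call this from their NETDEV_CHANGE
 * notifier so a virtual device mirrors its lower device's link state:
 *
 *	netif_stacked_transfer_operstate(real_dev, vlan_dev);
 */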
4904 * register_netdevice - register a network device
4905 * @dev: device to register
4907 * Take a completed network device structure and add it to the kernel
4908 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4909 * chain. 0 is returned on success. A negative errno code is returned
4910 * on a failure to set up the device, or if the name is a duplicate.
4912 * Callers must hold the rtnl semaphore. You may want
4913 * register_netdev() instead of this.
4916 * The locking appears insufficient to guarantee two parallel registers
4917 * will not get the same name.
4920 int register_netdevice(struct net_device *dev)
4923 struct net *net = dev_net(dev);
4925 BUG_ON(dev_boot_phase);
	/* When net_devices are persistent, this will be fatal. */
4931 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4934 spin_lock_init(&dev->addr_list_lock);
4935 netdev_set_addr_lockdep_class(dev);
4936 netdev_init_queue_locks(dev);
4941 if (!dev->num_rx_queues) {
		/*
		 * Allocate a single RX queue if the driver never called
		 * alloc_netdev_mq().
		 */
4947 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
4953 dev->_rx->first = dev->_rx;
4954 atomic_set(&dev->_rx->count, 1);
4955 dev->num_rx_queues = 1;
4958 /* Init, if this function is available */
4959 if (dev->netdev_ops->ndo_init) {
4960 ret = dev->netdev_ops->ndo_init(dev);
4968 ret = dev_get_valid_name(net, dev->name, dev->name, 0);
4972 dev->ifindex = dev_new_index(net);
4973 if (dev->iflink == -1)
4974 dev->iflink = dev->ifindex;
4976 /* Fix illegal checksum combinations */
4977 if ((dev->features & NETIF_F_HW_CSUM) &&
4978 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4979 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4981 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4984 if ((dev->features & NETIF_F_NO_CSUM) &&
4985 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4986 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4988 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4991 dev->features = netdev_fix_features(dev->features, dev->name);
4993 /* Enable software GSO if SG is supported. */
4994 if (dev->features & NETIF_F_SG)
4995 dev->features |= NETIF_F_GSO;
4997 netdev_initialize_kobject(dev);
4999 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5000 ret = notifier_to_errno(ret);
5004 ret = netdev_register_kobject(dev);
5007 dev->reg_state = NETREG_REGISTERED;
	/*
	 *	Default initial state at registration is that the
	 *	device is present.
	 */
5014 set_bit(__LINK_STATE_PRESENT, &dev->state);
5016 dev_init_scheduler(dev);
5018 list_netdevice(dev);
5020 /* Notify protocols, that a new device appeared. */
5021 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5022 ret = notifier_to_errno(ret);
5024 rollback_registered(dev);
5025 dev->reg_state = NETREG_UNREGISTERED;
	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully set up before sending notifications.
	 */
5031 if (!dev->rtnl_link_ops ||
5032 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5033 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5039 if (dev->netdev_ops->ndo_uninit)
5040 dev->netdev_ops->ndo_uninit(dev);
5043 EXPORT_SYMBOL(register_netdevice);
5046 * init_dummy_netdev - init a dummy network device for NAPI
5047 * @dev: device to init
 * This takes a network device structure and initializes the minimum
 * number of fields so it can be used to schedule NAPI polls without
 * registering a full-blown interface. This is to be used by drivers
 * that need to tie several hardware interfaces to a single NAPI
 * poll scheduler due to HW limitations.
 */
5055 int init_dummy_netdev(struct net_device *dev)
	/* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
	 * NAPI code, and this dummy netdev is supposed to be
	 * used only for NAPI polls.
	 */
5062 memset(dev, 0, sizeof(struct net_device));
5064 /* make sure we BUG if trying to hit standard
5065 * register/unregister code path
5067 dev->reg_state = NETREG_DUMMY;
5069 /* initialize the ref count */
5070 atomic_set(&dev->refcnt, 1);
5072 /* NAPI wants this */
5073 INIT_LIST_HEAD(&dev->napi_list);
5075 /* a dummy interface is started by default */
5076 set_bit(__LINK_STATE_PRESENT, &dev->state);
5077 set_bit(__LINK_STATE_START, &dev->state);
5081 EXPORT_SYMBOL_GPL(init_dummy_netdev);
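/* Usage sketch (hypothetical wireless-style driver): one dummy netdev
 * can host a NAPI context when the hardware has no natural net_device
 * to hang it on. example_poll is a made-up poll callback:
 *
 *	init_dummy_netdev(&adapter->napi_dev);
 *	netif_napi_add(&adapter->napi_dev, &adapter->napi,
 *		       example_poll, 64);
 */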
5085 * register_netdev - register a network device
5086 * @dev: device to register
5088 * Take a completed network device structure and add it to the kernel
5089 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5090 * chain. 0 is returned on success. A negative errno code is returned
5091 * on a failure to set up the device, or if the name is a duplicate.
5093 * This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev().
 */
5097 int register_netdev(struct net_device *dev)
	/*
	 * If the name is a format string the caller wants us to do a
	 * name allocation.
	 */
5107 if (strchr(dev->name, '%')) {
5108 err = dev_alloc_name(dev, dev->name);
5113 err = register_netdevice(dev);
5118 EXPORT_SYMBOL(register_netdev);
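/* Usage sketch (hypothetical PCI Ethernet probe): the usual pattern is
 * alloc_etherdev() -> fill in the ops -> register_netdev(). The
 * "eth%d" template that alloc_etherdev() installs is expanded here to
 * the first free unit:
 *
 *	netdev = alloc_etherdev(sizeof(struct example_adapter));
 *	if (!netdev)
 *		return -ENOMEM;
 *	netdev->netdev_ops = &example_netdev_ops;
 *	err = register_netdev(netdev);
 *	if (err) {
 *		free_netdev(netdev);
 *		return err;
 *	}
 */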
/* netdev_wait_allrefs - wait until all references are gone.
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	while (atomic_read(&dev->refcnt) != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
			 * should have already handled it the first time */

			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		if (time_after(jiffies, warning_time + 10 * HZ)) {
			printk(KERN_EMERG "unregister_netdevice: "
			       "waiting for %s to become free. Usage "
			       "count = %d\n",
			       dev->name, atomic_read(&dev->refcnt));
			warning_time = jiffies;
		}
	}
}
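
/*
 * Example (illustrative sketch only; "my_dev_event" and "my_release_dev"
 * are hypothetical): the sort of notifier a reference holder registers,
 * dropping its reference on NETDEV_UNREGISTER so the wait above can
 * terminate.
 *
 *	static int my_dev_event(struct notifier_block *nb,
 *				unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UNREGISTER)
 *			my_release_dev(dev);	(ends with dev_put(dev))
 *		return NOTIFY_DONE;
 *	}
 */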
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *	...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			printk(KERN_ERR "network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		on_each_cpu(flush_backlog, dev, 1);

		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(atomic_read(&dev->refcnt));
		WARN_ON(dev->ip_ptr);
		WARN_ON(dev->ip6_ptr);
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
/**
 * dev_txq_stats_fold - fold tx_queues stats
 * @dev: device to get statistics from
 * @stats: struct net_device_stats to hold results
 */
void dev_txq_stats_fold(const struct net_device *dev,
			struct net_device_stats *stats)
{
	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
	unsigned int i;
	struct netdev_queue *txq;

	for (i = 0; i < dev->num_tx_queues; i++) {
		txq = netdev_get_tx_queue(dev, i);
		tx_bytes += txq->tx_bytes;
		tx_packets += txq->tx_packets;
		tx_dropped += txq->tx_dropped;
	}
	if (tx_bytes || tx_packets || tx_dropped) {
		stats->tx_bytes = tx_bytes;
		stats->tx_packets = tx_packets;
		stats->tx_dropped = tx_dropped;
	}
}
EXPORT_SYMBOL(dev_txq_stats_fold);
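
/*
 * Example (illustrative sketch only; "my_get_stats" is hypothetical):
 * a multiqueue driver can implement its ndo_get_stats hook by folding
 * the per-queue counters into dev->stats before returning it, mirroring
 * what dev_get_stats() below does by default.
 *
 *	static struct net_device_stats *my_get_stats(struct net_device *dev)
 *	{
 *		dev_txq_stats_fold(dev, &dev->stats);
 *		return &dev->stats;
 *	}
 */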
/**
 * dev_get_stats - get network device statistics
 * @dev: device to get statistics from
 *
 * Get network statistics from device. The device driver may provide
 * its own method by setting dev->netdev_ops->get_stats; otherwise
 * the internal statistics structure is used.
 */
const struct net_device_stats *dev_get_stats(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_get_stats)
		return ops->ndo_get_stats(dev);

	dev_txq_stats_fold(dev, &dev->stats);
	return &dev->stats;
}
EXPORT_SYMBOL(dev_get_stats);
static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue,
				  void *_unused)
{
	queue->dev = dev;
}

static void netdev_init_queues(struct net_device *dev)
{
	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
	spin_lock_init(&dev->tx_global_lock);
}
/**
 * alloc_netdev_mq - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @setup: callback to initialize device
 * @queue_count: the number of subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization. Also allocates subqueue structs
 * for each queue on the device at the end of the netdevice.
 */
struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
		void (*setup)(struct net_device *), unsigned int queue_count)
{
	struct netdev_queue *tx;
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;
#ifdef CONFIG_RPS
	struct netdev_rx_queue *rx;
	int i;
#endif

	BUG_ON(strlen(name) >= sizeof(dev->name));

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL);
	if (!p) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
		return NULL;
	}

	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
	if (!tx) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate "
		       "tx qdiscs.\n");
		goto free_p;
	}

#ifdef CONFIG_RPS
	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
	if (!rx) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate "
		       "rx queues.\n");
		goto free_tx;
	}

	atomic_set(&rx->count, queue_count);

	/*
	 * Set a pointer to first element in the array which holds the
	 * reference count.
	 */
	for (i = 0; i < queue_count; i++)
		rx[i].first = rx;
#endif

	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	if (dev_addr_init(dev))
		goto free_rx;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->_tx = tx;
	dev->num_tx_queues = queue_count;
	dev->real_num_tx_queues = queue_count;

#ifdef CONFIG_RPS
	dev->_rx = rx;
	dev->num_rx_queues = queue_count;
#endif

	dev->gso_max_size = GSO_MAX_SIZE;

	netdev_init_queues(dev);

	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
	dev->ethtool_ntuple_list.count = 0;
	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	dev->priv_flags = IFF_XMIT_DST_RELEASE;
	setup(dev);
	strcpy(dev->name, name);
	return dev;

free_rx:
#ifdef CONFIG_RPS
	kfree(rx);
free_tx:
#endif
	kfree(tx);
free_p:
	kfree(p);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mq);
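
/*
 * Example (illustrative sketch only; "struct my_priv" is hypothetical):
 * allocating an 8-queue ethernet device, with ether_setup() filling in
 * the generic ethernet fields.
 *
 *	dev = alloc_netdev_mq(sizeof(struct my_priv), "eth%d",
 *			      ether_setup, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */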
/**
 * free_netdev - free network device
 * @dev: device
 *
 * This function does the last stage of destroying an allocated device
 * interface. The reference to the device object is released.
 * If this is the last reference then it will be freed.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	release_net(dev_net(dev));

	kfree(dev->_tx);

	/* Flush device addresses */
	dev_addr_flush(dev);

	/* Clear ethtool n-tuple list */
	ethtool_ntuple_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	/* Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		kfree((char *)dev - dev->padded);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
/**
 * synchronize_net - Synchronize with packet receive processing
 *
 * Wait for packets currently being received to be done.
 * Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
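
/*
 * Example (illustrative sketch only; "my_pt" and "my_state" are
 * hypothetical): unhook a packet handler with __dev_remove_pack(), wait
 * out in-flight receivers, then free the handler's state.
 *
 *	__dev_remove_pack(&my_pt);
 *	synchronize_net();
 *	kfree(my_state);
 */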
/**
 * unregister_netdevice_queue - remove device from the kernel
 * @dev: device
 * @head: list
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables.
 * If head not NULL, device is queued to be unregistered later.
 *
 * Callers must hold the rtnl semaphore. You may want
 * unregister_netdev() instead of this.
 */
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);
/**
 * unregister_netdevice_many - unregister many devices
 * @head: list of devices
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);
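
/*
 * Example (illustrative sketch only; "dev1" and "dev2" are hypothetical):
 * batching teardown of several devices so the expensive rollback and
 * notifier work is shared across one rtnl round trip.
 *
 *	LIST_HEAD(list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &list);
 *	unregister_netdevice_queue(dev2, &list);
 *	unregister_netdevice_many(&list);
 *	rtnl_unlock();
 */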
/**
 * unregister_netdev - remove device from the kernel
 * @dev: device
 *
 * This function shuts down a device interface and removes it
 * from the kernel tables.
 *
 * This is just a wrapper for unregister_netdevice that takes
 * the rtnl semaphore. In general you want to use this and not
 * unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
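
/*
 * Example (illustrative sketch only): the matching teardown for the
 * register_netdev() example above, from a driver's remove routine.
 *
 *	unregister_netdev(dev);		(takes and drops the rtnl lock)
 *	free_netdev(dev);		(final put once unregistered)
 */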
/**
 * dev_change_net_namespace - move device to different network namespace
 * @dev: device
 * @net: network namespace
 * @pat: If not NULL name pattern to try if the current device name
 *	 is already taken in the destination network namespace.
 *
 * This function shuts down a device interface and moves it
 * to a new network namespace. On success 0 is returned, on
 * a failure a negative errno code is returned.
 *
 * Callers must hold the rtnl semaphore.
 */
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Don't allow real devices to be moved when sysfs
	 * is enabled.
	 */
	err = -EINVAL;
	if (dev->dev.parent)
		goto out;

	/* Ensure the device has been registered */
	err = -EINVAL;
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (dev_get_valid_name(net, pat, dev->name, 1))
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice() and
	 * unregister_netdevice().
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols that we are about to destroy
	 * this device. They should clean all the things.
	 */
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);

	/*
	 * Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	netdev_unregister_kobject(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex)) {
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Fixup kobjects */
	err = netdev_register_kobject(dev);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 * Prevent userspace races by waiting until the network
	 * device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
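
/*
 * Example (illustrative sketch only; "target_net" is hypothetical):
 * moving a device into another namespace under rtnl, with a "dev%d"
 * fallback pattern in case the current name is already taken there.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "dev%d");
 *	rtnl_unlock();
 */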
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx(skb);
		input_queue_head_add(oldsd, 1);
	}
	while ((skb = __skb_dequeue(&oldsd->process_queue)))
		netif_rx(skb);

	return NOTIFY_OK;
}
/**
 * netdev_increment_features - increment feature set by one
 * @all: current feature set
 * @one: new feature set
 * @mask: mask feature set
 *
 * Computes a new feature set after adding a device with feature set
 * @one to the master device with current feature set @all. Will not
 * enable anything that is off in @mask. Returns the new feature set.
 */
unsigned long netdev_increment_features(unsigned long all, unsigned long one,
					unsigned long mask)
{
	/* If device needs checksumming, downgrade to it. */
	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
	else if (mask & NETIF_F_ALL_CSUM) {
		/* If one device supports v4/v6 checksumming, set for all. */
		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
		    !(all & NETIF_F_GEN_CSUM)) {
			all &= ~NETIF_F_ALL_CSUM;
			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
		}

		/* If one device supports hw checksumming, set for all. */
		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
			all &= ~NETIF_F_ALL_CSUM;
			all |= NETIF_F_HW_CSUM;
		}
	}

	one |= NETIF_F_ALL_CSUM;

	one |= all & NETIF_F_ONE_FOR_ALL;
	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
	all |= one & mask & NETIF_F_ONE_FOR_ALL;

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
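
/*
 * Example (illustrative sketch only; "master" and the slave list are
 * hypothetical): how an aggregating master such as bonding might
 * recompute its feature set as slaves are added, one slave at a time.
 *
 *	unsigned long features = master->features & ~NETIF_F_ONE_FOR_ALL;
 *
 *	(for each slave device "slave")
 *		features = netdev_increment_features(features,
 *						     slave->features,
 *						     NETIF_F_ONE_FOR_ALL);
 *	master->features = features;
 */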
static struct hlist_head *netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}
/**
 * netdev_drivername - network driver for the device
 * @dev: network device
 * @buffer: buffer for resulting name
 * @len: size of buffer
 *
 * Determine network driver for device.
 */
char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
{
	const struct device_driver *driver;
	const struct device *parent;
	char *empty = "";

	if (len <= 0 || !buffer)
		return empty;
	buffer[0] = 0;

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		strlcpy(buffer, driver->name, len);
	return buffer;
}

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}
static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};
static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
				__func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace. Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};
/*
 * Initialize the DEV module. At boot time this walks the device list and
 * unhooks any devices that fail to initialise (normally hardware not
 * present) and leaves us with a valid list of present and active devices.
 */

/*
 * This is called single threaded during boot, so no need
 * to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 * Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		memset(sd, 0, sizeof(*sd));
		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		sd->completion_queue = NULL;
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->csd.flags = 0;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
		sd->backlog.gro_list = NULL;
		sd->backlog.gro_count = 0;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device is
	 * present in a network namespace, the loopback device must be
	 * present too. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by keeping
	 * the loopback device as the first device on the list of network
	 * devices, so that it is the first device that appears and the
	 * last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);
static int __init initialize_hashrnd(void)
{
	get_random_bytes(&hashrnd, sizeof(hashrnd));
	return 0;
}

late_initcall_sync(initialize_hashrnd);