net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135 #include <linux/cpu_rmap.h>
 136
 137 #include "net-sysfs.h"
 138
 139 /* Instead of increasing this, you should create a hash table. */
 140 #define MAX_GRO_SKBS 8
 141
 142 /* This should be increased if a protocol with a bigger head is added. */
 143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
 145 /*
 146  *      The list of packet types we will receive (as opposed to discard)
 147  *      and the routines to invoke.
 148  *
 149  *      Why 16. Because with 16 the only overlap we get on a hash of the
 150  *      low nibble of the protocol value is RARP/SNAP/X.25.
 151  *
 152  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 153  *             sure which should go first, but I bet it won't make much
 154  *             difference if we are running VLANs.  The good news is that
 155  *             this protocol won't be in the list unless compiled in, so
 156  *             the average user (w/out VLANs) will not be adversely affected.
 157  *             --BLG
 158  *
 159  *              0800    IP
 160  *              8100    802.1Q VLAN
 161  *              0001    802.3
 162  *              0002    AX.25
 163  *              0004    802.2
 164  *              8035    RARP
 165  *              0005    SNAP
 166  *              0805    X.25
 167  *              0806    ARP
 168  *              8137    IPX
 169  *              0009    Localtalk
 170  *              86DD    IPv6
 171  */
 172
 173 #define PTYPE_HASH_SIZE (16)
 174 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 175
 176 static DEFINE_SPINLOCK(ptype_lock);
 177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 178 static struct list_head ptype_all __read_mostly;        /* Taps */
 179
 180 /*
 181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182  * semaphore.
 183  *
 184  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 185  *
 186  * Writers must hold the rtnl semaphore while they loop through the
 187  * dev_base_head list, and hold dev_base_lock for writing when they do the
 188  * actual updates.  This allows pure readers to access the list even
 189  * while a writer is preparing to update it.
 190  *
 191  * To put it another way, dev_base_lock is held for writing only to
 192  * protect against pure readers; the rtnl semaphore provides the
 193  * protection against other writers.
 194  *
 195  * See, for example usages, register_netdevice() and
 196  * unregister_netdevice(), which must be called with the rtnl
 197  * semaphore held.
 198  */
 199 DEFINE_RWLOCK(dev_base_lock);
 200 EXPORT_SYMBOL(dev_base_lock);
 201
 202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 203 {
 204         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 205         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 206 }
 207
 208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 209 {
 210         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 211 }
 212
 213 static inline void rps_lock(struct softnet_data *sd)
 214 {
 215 #ifdef CONFIG_RPS
 216         spin_lock(&sd->input_pkt_queue.lock);
 217 #endif
 218 }
 219
 220 static inline void rps_unlock(struct softnet_data *sd)
 221 {
 222 #ifdef CONFIG_RPS
 223         spin_unlock(&sd->input_pkt_queue.lock);
 224 #endif
 225 }
 226
 227 /* Device list insertion */
 228 static int list_netdevice(struct net_device *dev)
 229 {
 230         struct net *net = dev_net(dev);
 231
 232         ASSERT_RTNL();
 233
 234         write_lock_bh(&dev_base_lock);
 235         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 236         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 237         hlist_add_head_rcu(&dev->index_hlist,
 238                            dev_index_hash(net, dev->ifindex));
 239         write_unlock_bh(&dev_base_lock);
 240         return 0;
 241 }
 242
 243 /* Device list removal
 244  * caller must respect a RCU grace period before freeing/reusing dev
 245  */
 246 static void unlist_netdevice(struct net_device *dev)
 247 {
 248         ASSERT_RTNL();
 249
 250         /* Unlink dev from the device chain */
 251         write_lock_bh(&dev_base_lock);
 252         list_del_rcu(&dev->dev_list);
 253         hlist_del_rcu(&dev->name_hlist);
 254         hlist_del_rcu(&dev->index_hlist);
 255         write_unlock_bh(&dev_base_lock);
 256 }
 257
 258 /*
 259  *      Our notifier list
 260  */
 261
 262 static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264 /*
 265  *      Device drivers call our routines to queue packets here. We empty the
 266  *      queue in the local softnet handler.
 267  */
 268
 269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 270 EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 272 #ifdef CONFIG_LOCKDEP
 273 /*
 274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 275  * according to dev->type
 276  */
 277 static const unsigned short netdev_lock_type[] =
 278         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 279          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 280          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 281          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 282          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 283          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 284          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 285          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 286          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 287          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 288          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 289          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 290          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 291          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 292          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 293          ARPHRD_VOID, ARPHRD_NONE};
 294
 295 static const char *const netdev_lock_name[] =
 296         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 297          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 298          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 299          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 300          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 301          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 302          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 303          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 304          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 305          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 306          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 307          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 308          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 309          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 310          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 311          "_xmit_VOID", "_xmit_NONE"};
 312
 313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 315
 316 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 317 {
 318         int i;
 319
 320         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 321                 if (netdev_lock_type[i] == dev_type)
 322                         return i;
 323         /* the last key is used by default */
 324         return ARRAY_SIZE(netdev_lock_type) - 1;
 325 }
 326
 327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 328                                                  unsigned short dev_type)
 329 {
 330         int i;
 331
 332         i = netdev_lock_pos(dev_type);
 333         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 334                                    netdev_lock_name[i]);
 335 }
 336
 337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 338 {
 339         int i;
 340
 341         i = netdev_lock_pos(dev->type);
 342         lockdep_set_class_and_name(&dev->addr_list_lock,
 343                                    &netdev_addr_lock_key[i],
 344                                    netdev_lock_name[i]);
 345 }
 346 #else
 347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 348                                                  unsigned short dev_type)
 349 {
 350 }
 351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 352 {
 353 }
 354 #endif
 355
 356 /*******************************************************************************
 357
 358                 Protocol management and registration routines
 359
 360 *******************************************************************************/
 361
 362 /*
 363  *      Add a protocol ID to the list. Now that the input handler is
 364  *      smarter we can dispense with all the messy stuff that used to be
 365  *      here.
 366  *
 367  *      BEWARE!!! Protocol handlers, mangling input packets,
 368  *      MUST BE last in hash buckets and checking protocol handlers
 369  *      MUST start from promiscuous ptype_all chain in net_bh.
 370  *      It is true now, do not change it.
 371  *      Explanation follows: if protocol handler, mangling packet, will
 372  *      be the first on list, it is not able to sense, that packet
 373  *      is cloned and should be copied-on-write, so that it will
 374  *      change it and subsequent readers will get broken packet.
 375  *                                                      --ANK (980803)
 376  */
 377
 378 static inline struct list_head *ptype_head(const struct packet_type *pt)
 379 {
 380         if (pt->type == htons(ETH_P_ALL))
 381                 return &ptype_all;
 382         else
 383                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 384 }
 385
 386 /**
 387  *      dev_add_pack - add packet handler
 388  *      @pt: packet type declaration
 389  *
 390  *      Add a protocol handler to the networking stack. The passed &packet_type
 391  *      is linked into kernel lists and may not be freed until it has been
 392  *      removed from the kernel lists.
 393  *
 394  *      This call does not sleep therefore it can not
 395  *      guarantee all CPU's that are in middle of receiving packets
 396  *      will see the new packet type (until the next received packet).
 397  */
 398
 399 void dev_add_pack(struct packet_type *pt)
 400 {
 401         struct list_head *head = ptype_head(pt);
 402
 403         spin_lock(&ptype_lock);
 404         list_add_rcu(&pt->list, head);
 405         spin_unlock(&ptype_lock);
 406 }
 407 EXPORT_SYMBOL(dev_add_pack);
 408
 409 /**
 410  *      __dev_remove_pack        - remove packet handler
 411  *      @pt: packet type declaration
 412  *
 413  *      Remove a protocol handler that was previously added to the kernel
 414  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 415  *      from the kernel lists and can be freed or reused once this function
 416  *      returns.
 417  *
 418  *      The packet type might still be in use by receivers
 419  *      and must not be freed until after all the CPU's have gone
 420  *      through a quiescent state.
 421  */
 422 void __dev_remove_pack(struct packet_type *pt)
 423 {
 424         struct list_head *head = ptype_head(pt);
 425         struct packet_type *pt1;
 426
 427         spin_lock(&ptype_lock);
 428
 429         list_for_each_entry(pt1, head, list) {
 430                 if (pt == pt1) {
 431                         list_del_rcu(&pt->list);
 432                         goto out;
 433                 }
 434         }
 435
 436         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 437 out:
 438         spin_unlock(&ptype_lock);
 439 }
 440 EXPORT_SYMBOL(__dev_remove_pack);
 441
 442 /**
 443  *      dev_remove_pack  - remove packet handler
 444  *      @pt: packet type declaration
 445  *
 446  *      Remove a protocol handler that was previously added to the kernel
 447  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 448  *      from the kernel lists and can be freed or reused once this function
 449  *      returns.
 450  *
 451  *      This call sleeps to guarantee that no CPU is looking at the packet
 452  *      type after return.
 453  */
 454 void dev_remove_pack(struct packet_type *pt)
 455 {
 456         __dev_remove_pack(pt);
 457
 458         synchronize_net();
 459 }
 460 EXPORT_SYMBOL(dev_remove_pack);
 461
 462 /******************************************************************************
 463
 464                       Device Boot-time Settings Routines
 465
 466 *******************************************************************************/
 467
 468 /* Boot time configuration table */
 469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 470
 471 /**
 472  *      netdev_boot_setup_add   - add new setup entry
 473  *      @name: name of the device
 474  *      @map: configured settings for the device
 475  *
 476  *      Adds new setup entry to the dev_boot_setup list.  The function
 477  *      returns 0 on error and 1 on success.  This is a generic routine to
 478  *      all netdevices.
 479  */
 480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 481 {
 482         struct netdev_boot_setup *s;
 483         int i;
 484
 485         s = dev_boot_setup;
 486         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 487                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 488                         memset(s[i].name, 0, sizeof(s[i].name));
 489                         strlcpy(s[i].name, name, IFNAMSIZ);
 490                         memcpy(&s[i].map, map, sizeof(s[i].map));
 491                         break;
 492                 }
 493         }
 494
 495         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 496 }
 497
 498 /**
 499  *      netdev_boot_setup_check - check boot time settings
 500  *      @dev: the netdevice
 501  *
 502  *      Check boot time settings for the device.
 503  *      The found settings are set for the device to be used
 504  *      later in the device probing.
 505  *      Returns 0 if no settings found, 1 if they are.
 506  */
 507 int netdev_boot_setup_check(struct net_device *dev)
 508 {
 509         struct netdev_boot_setup *s = dev_boot_setup;
 510         int i;
 511
 512         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 513                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 514                     !strcmp(dev->name, s[i].name)) {
 515                         dev->irq        = s[i].map.irq;
 516                         dev->base_addr  = s[i].map.base_addr;
 517                         dev->mem_start  = s[i].map.mem_start;
 518                         dev->mem_end    = s[i].map.mem_end;
 519                         return 1;
 520                 }
 521         }
 522         return 0;
 523 }
 524 EXPORT_SYMBOL(netdev_boot_setup_check);
 525
 526
 527 /**
 528  *      netdev_boot_base        - get address from boot time settings
 529  *      @prefix: prefix for network device
 530  *      @unit: id for network device
 531  *
 532  *      Check boot time settings for the base address of device.
 533  *      The found settings are set for the device to be used
 534  *      later in the device probing.
 535  *      Returns 0 if no settings found.
 536  */
 537 unsigned long netdev_boot_base(const char *prefix, int unit)
 538 {
 539         const struct netdev_boot_setup *s = dev_boot_setup;
 540         char name[IFNAMSIZ];
 541         int i;
 542
 543         sprintf(name, "%s%d", prefix, unit);
 544
 545         /*
 546          * If device already registered then return base of 1
 547          * to indicate not to probe for this interface
 548          */
 549         if (__dev_get_by_name(&init_net, name))
 550                 return 1;
 551
 552         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 553                 if (!strcmp(name, s[i].name))
 554                         return s[i].map.base_addr;
 555         return 0;
 556 }
 557
 558 /*
 559  * Saves at boot time configured settings for any netdevice.
 560  */
 561 int __init netdev_boot_setup(char *str)
 562 {
 563         int ints[5];
 564         struct ifmap map;
 565
 566         str = get_options(str, ARRAY_SIZE(ints), ints);
 567         if (!str || !*str)
 568                 return 0;
 569
 570         /* Save settings */
 571         memset(&map, 0, sizeof(map));
 572         if (ints[0] > 0)
 573                 map.irq = ints[1];
 574         if (ints[0] > 1)
 575                 map.base_addr = ints[2];
 576         if (ints[0] > 2)
 577                 map.mem_start = ints[3];
 578         if (ints[0] > 3)
 579                 map.mem_end = ints[4];
 580
 581         /* Add new entry to the list */
 582         return netdev_boot_setup_add(str, &map);
 583 }
 584
 585 __setup("netdev=", netdev_boot_setup);
 586
 587 /*******************************************************************************
 588
 589                             Device Interface Subroutines
 590
 591 *******************************************************************************/
 592
 593 /**
 594  *      __dev_get_by_name       - find a device by its name
 595  *      @net: the applicable net namespace
 596  *      @name: name to find
 597  *
 598  *      Find an interface by name. Must be called under RTNL semaphore
 599  *      or @dev_base_lock. If the name is found a pointer to the device
 600  *      is returned. If the name is not found then %NULL is returned. The
 601  *      reference counters are not incremented so the caller must be
 602  *      careful with locks.
 603  */
 604
 605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 606 {
 607         struct hlist_node *p;
 608         struct net_device *dev;
 609         struct hlist_head *head = dev_name_hash(net, name);
 610
 611         hlist_for_each_entry(dev, p, head, name_hlist)
 612                 if (!strncmp(dev->name, name, IFNAMSIZ))
 613                         return dev;
 614
 615         return NULL;
 616 }
 617 EXPORT_SYMBOL(__dev_get_by_name);
 618
 619 /**
 620  *      dev_get_by_name_rcu     - find a device by its name
 621  *      @net: the applicable net namespace
 622  *      @name: name to find
 623  *
 624  *      Find an interface by name.
 625  *      If the name is found a pointer to the device is returned.
 626  *      If the name is not found then %NULL is returned.
 627  *      The reference counters are not incremented so the caller must be
 628  *      careful with locks. The caller must hold RCU lock.
 629  */
 630
 631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 632 {
 633         struct hlist_node *p;
 634         struct net_device *dev;
 635         struct hlist_head *head = dev_name_hash(net, name);
 636
 637         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 638                 if (!strncmp(dev->name, name, IFNAMSIZ))
 639                         return dev;
 640
 641         return NULL;
 642 }
 643 EXPORT_SYMBOL(dev_get_by_name_rcu);
 644
 645 /**
 646  *      dev_get_by_name         - find a device by its name
 647  *      @net: the applicable net namespace
 648  *      @name: name to find
 649  *
 650  *      Find an interface by name. This can be called from any
 651  *      context and does its own locking. The returned handle has
 652  *      the usage count incremented and the caller must use dev_put() to
 653  *      release it when it is no longer needed. %NULL is returned if no
 654  *      matching device is found.
 655  */
 656
 657 struct net_device *dev_get_by_name(struct net *net, const char *name)
 658 {
 659         struct net_device *dev;
 660
 661         rcu_read_lock();
 662         dev = dev_get_by_name_rcu(net, name);
 663         if (dev)
 664                 dev_hold(dev);
 665         rcu_read_unlock();
 666         return dev;
 667 }
 668 EXPORT_SYMBOL(dev_get_by_name);
 669
 670 /**
 671  *      __dev_get_by_index - find a device by its ifindex
 672  *      @net: the applicable net namespace
 673  *      @ifindex: index of device
 674  *
 675  *      Search for an interface by index. Returns %NULL if the device
 676  *      is not found or a pointer to the device. The device has not
 677  *      had its reference counter increased so the caller must be careful
 678  *      about locking. The caller must hold either the RTNL semaphore
 679  *      or @dev_base_lock.
 680  */
 681
 682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 683 {
 684         struct hlist_node *p;
 685         struct net_device *dev;
 686         struct hlist_head *head = dev_index_hash(net, ifindex);
 687
 688         hlist_for_each_entry(dev, p, head, index_hlist)
 689                 if (dev->ifindex == ifindex)
 690                         return dev;
 691
 692         return NULL;
 693 }
 694 EXPORT_SYMBOL(__dev_get_by_index);
 695
 696 /**
 697  *      dev_get_by_index_rcu - find a device by its ifindex
 698  *      @net: the applicable net namespace
 699  *      @ifindex: index of device
 700  *
 701  *      Search for an interface by index. Returns %NULL if the device
 702  *      is not found or a pointer to the device. The device has not
 703  *      had its reference counter increased so the caller must be careful
 704  *      about locking. The caller must hold RCU lock.
 705  */
 706
 707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 708 {
 709         struct hlist_node *p;
 710         struct net_device *dev;
 711         struct hlist_head *head = dev_index_hash(net, ifindex);
 712
 713         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 714                 if (dev->ifindex == ifindex)
 715                         return dev;
 716
 717         return NULL;
 718 }
 719 EXPORT_SYMBOL(dev_get_by_index_rcu);
 720
 721
 722 /**
 723  *      dev_get_by_index - find a device by its ifindex
 724  *      @net: the applicable net namespace
 725  *      @ifindex: index of device
 726  *
 727  *      Search for an interface by index. Returns NULL if the device
 728  *      is not found or a pointer to the device. The device returned has
 729  *      had a reference added and the pointer is safe until the user calls
 730  *      dev_put to indicate they have finished with it.
 731  */
 732
 733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 734 {
 735         struct net_device *dev;
 736
 737         rcu_read_lock();
 738         dev = dev_get_by_index_rcu(net, ifindex);
 739         if (dev)
 740                 dev_hold(dev);
 741         rcu_read_unlock();
 742         return dev;
 743 }
 744 EXPORT_SYMBOL(dev_get_by_index);
 745
 746 /**
 747  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 748  *      @net: the applicable net namespace
 749  *      @type: media type of device
 750  *      @ha: hardware address
 751  *
 752  *      Search for an interface by MAC address. Returns NULL if the device
 753  *      is not found or a pointer to the device.
 754  *      The caller must hold RCU or RTNL.
 755  *      The returned device has not had its ref count increased
 756  *      and the caller must therefore be careful about locking
 757  *
 758  */
 759
 760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 761                                        const char *ha)
 762 {
 763         struct net_device *dev;
 764
 765         for_each_netdev_rcu(net, dev)
 766                 if (dev->type == type &&
 767                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 768                         return dev;
 769
 770         return NULL;
 771 }
 772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 773
 774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 775 {
 776         struct net_device *dev;
 777
 778         ASSERT_RTNL();
 779         for_each_netdev(net, dev)
 780                 if (dev->type == type)
 781                         return dev;
 782
 783         return NULL;
 784 }
 785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 786
 787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 788 {
 789         struct net_device *dev, *ret = NULL;
 790
 791         rcu_read_lock();
 792         for_each_netdev_rcu(net, dev)
 793                 if (dev->type == type) {
 794                         dev_hold(dev);
 795                         ret = dev;
 796                         break;
 797                 }
 798         rcu_read_unlock();
 799         return ret;
 800 }
 801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 802
 803 /**
 804  *      dev_get_by_flags_rcu - find any device with given flags
 805  *      @net: the applicable net namespace
 806  *      @if_flags: IFF_* values
 807  *      @mask: bitmask of bits in if_flags to check
 808  *
 809  *      Search for any interface with the given flags. Returns NULL if a device
 810  *      is not found or a pointer to the device. Must be called inside
 811  *      rcu_read_lock(), and result refcount is unchanged.
 812  */
 813
 814 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 815                                     unsigned short mask)
 816 {
 817         struct net_device *dev, *ret;
 818
 819         ret = NULL;
 820         for_each_netdev_rcu(net, dev) {
 821                 if (((dev->flags ^ if_flags) & mask) == 0) {
 822                         ret = dev;
 823                         break;
 824                 }
 825         }
 826         return ret;
 827 }
 828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 829
 830 /**
 831  *      dev_valid_name - check if name is okay for network device
 832  *      @name: name string
 833  *
 834  *      Network device names need to be valid file names to
 835  *      to allow sysfs to work.  We also disallow any kind of
 836  *      whitespace.
 837  */
 838 int dev_valid_name(const char *name)
 839 {
 840         if (*name == '\0')
 841                 return 0;
 842         if (strlen(name) >= IFNAMSIZ)
 843                 return 0;
 844         if (!strcmp(name, ".") || !strcmp(name, ".."))
 845                 return 0;
 846
 847         while (*name) {
 848                 if (*name == '/' || isspace(*name))
 849                         return 0;
 850                 name++;
 851         }
 852         return 1;
 853 }
 854 EXPORT_SYMBOL(dev_valid_name);
 855
 856 /**
 857  *      __dev_alloc_name - allocate a name for a device
 858  *      @net: network namespace to allocate the device name in
 859  *      @name: name format string
 860  *      @buf:  scratch buffer and result name string
 861  *
 862  *      Passed a format string - eg "lt%d" it will try and find a suitable
 863  *      id. It scans list of devices to build up a free map, then chooses
 864  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 865  *      while allocating the name and adding the device in order to avoid
 866  *      duplicates.
 867  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 868  *      Returns the number of the unit assigned or a negative errno code.
 869  */
 870
 871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 872 {
 873         int i = 0;
 874         const char *p;
 875         const int max_netdevices = 8*PAGE_SIZE;
 876         unsigned long *inuse;
 877         struct net_device *d;
 878
 879         p = strnchr(name, IFNAMSIZ-1, '%');
 880         if (p) {
 881                 /*
 882                  * Verify the string as this thing may have come from
 883                  * the user.  There must be either one "%d" and no other "%"
 884                  * characters.
 885                  */
 886                 if (p[1] != 'd' || strchr(p + 2, '%'))
 887                         return -EINVAL;
 888
 889                 /* Use one page as a bit array of possible slots */
 890                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 891                 if (!inuse)
 892                         return -ENOMEM;
 893
 894                 for_each_netdev(net, d) {
 895                         if (!sscanf(d->name, name, &i))
 896                                 continue;
 897                         if (i < 0 || i >= max_netdevices)
 898                                 continue;
 899
 900                         /*  avoid cases where sscanf is not exact inverse of printf */
 901                         snprintf(buf, IFNAMSIZ, name, i);
 902                         if (!strncmp(buf, d->name, IFNAMSIZ))
 903                                 set_bit(i, inuse);
 904                 }
 905
 906                 i = find_first_zero_bit(inuse, max_netdevices);
 907                 free_page((unsigned long) inuse);
 908         }
 909
 910         if (buf != name)
 911                 snprintf(buf, IFNAMSIZ, name, i);
 912         if (!__dev_get_by_name(net, buf))
 913                 return i;
 914
 915         /* It is possible to run out of possible slots
 916          * when the name is long and there isn't enough space left
 917          * for the digits, or if all bits are used.
 918          */
 919         return -ENFILE;
 920 }
 921
 922 /**
 923  *      dev_alloc_name - allocate a name for a device
 924  *      @dev: device
 925  *      @name: name format string
 926  *
 927  *      Passed a format string - eg "lt%d" it will try and find a suitable
 928  *      id. It scans list of devices to build up a free map, then chooses
 929  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 930  *      while allocating the name and adding the device in order to avoid
 931  *      duplicates.
 932  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 933  *      Returns the number of the unit assigned or a negative errno code.
 934  */
 935
 936 int dev_alloc_name(struct net_device *dev, const char *name)
 937 {
 938         char buf[IFNAMSIZ];
 939         struct net *net;
 940         int ret;
 941
 942         BUG_ON(!dev_net(dev));
 943         net = dev_net(dev);
 944         ret = __dev_alloc_name(net, name, buf);
 945         if (ret >= 0)
 946                 strlcpy(dev->name, buf, IFNAMSIZ);
 947         return ret;
 948 }
 949 EXPORT_SYMBOL(dev_alloc_name);
 950
 951 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 952 {
 953         struct net *net;
 954
 955         BUG_ON(!dev_net(dev));
 956         net = dev_net(dev);
 957
 958         if (!dev_valid_name(name))
 959                 return -EINVAL;
 960
 961         if (fmt && strchr(name, '%'))
 962                 return dev_alloc_name(dev, name);
 963         else if (__dev_get_by_name(net, name))
 964                 return -EEXIST;
 965         else if (dev->name != name)
 966                 strlcpy(dev->name, name, IFNAMSIZ);
 967
 968         return 0;
 969 }
 970
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d".
 *	for wildcarding.
 *
 *	Fails with -EBUSY while the device has IFF_UP set and returns 0
 *	without doing anything when @newname already equals the current
 *	name. Otherwise returns the error from name validation,
 *	device_rename() or a vetoing NETDEV_CHANGENAME notifier, or on
 *	success the non-negative result of dev_get_valid_name().
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	/* Keep the current name so that a failed rename can be undone. */
	memcpy(oldname, dev->name, IFNAMSIZ);

	/* On success this has already written the new name into dev->name. */
	err = dev_get_valid_name(dev, newname, 1);
	if (err < 0)
		return err;

	/*
	 * Reached once for the forward rename and, if a notifier rejects
	 * it, a second time with dev->name restored to oldname.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	/*
	 * Unhash the device, wait for an RCU grace period, then hash it
	 * again under the name now stored in dev->name.
	 */
	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/* First veto: record it and redo the rename with the old name. */
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			/* The rollback itself was vetoed; only report it. */
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}
1037
1038 /**
1039  *      dev_set_alias - change ifalias of a device
1040  *      @dev: device
1041  *      @alias: name up to IFALIASZ
1042  *      @len: limit of bytes to copy from info
1043  *
1044  *      Set ifalias for a device,
1045  */
1046 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047 {
1048         ASSERT_RTNL();
1049
1050         if (len >= IFALIASZ)
1051                 return -EINVAL;
1052
1053         if (!len) {
1054                 if (dev->ifalias) {
1055                         kfree(dev->ifalias);
1056                         dev->ifalias = NULL;
1057                 }
1058                 return 0;
1059         }
1060
1061         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062         if (!dev->ifalias)
1063                 return -ENOMEM;
1064
1065         strlcpy(dev->ifalias, alias, len+1);
1066         return len;
1067 }
1068
1069
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features. Sends
 *	%NETDEV_FEAT_CHANGE for @dev through call_netdevice_notifiers();
 *	the notifier result is not reported back to the caller.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
1081
1082 /**
1083  *      netdev_state_change - device changes state
1084  *      @dev: device to cause notification
1085  *
1086  *      Called to indicate a device has changed state. This function calls
1087  *      the notifier chains for netdev_chain and sends a NEWLINK message
1088  *      to the routing socket.
1089  */
1090 void netdev_state_change(struct net_device *dev)
1091 {
1092         if (dev->flags & IFF_UP) {
1093                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095         }
1096 }
1097 EXPORT_SYMBOL(netdev_state_change);
1098
/*
 * Pass @event for @dev to call_netdevice_notifiers() and return its
 * result unchanged.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);
1104
1105 /**
1106  *      dev_load        - load a network module
1107  *      @net: the applicable net namespace
1108  *      @name: name of interface
1109  *
1110  *      If a network interface is not present and the process has suitable
1111  *      privileges this function loads the module. If module loading is not
1112  *      available in this kernel then it becomes a nop.
1113  */
1114
1115 void dev_load(struct net *net, const char *name)
1116 {
1117         struct net_device *dev;
1118
1119         rcu_read_lock();
1120         dev = dev_get_by_name_rcu(net, name);
1121         rcu_read_unlock();
1122
1123         if (!dev && capable(CAP_NET_ADMIN))
1124                 request_module("%s", name);
1125 }
1126 EXPORT_SYMBOL(dev_load);
1127
1128 static int __dev_open(struct net_device *dev)
1129 {
1130         const struct net_device_ops *ops = dev->netdev_ops;
1131         int ret;
1132
1133         ASSERT_RTNL();
1134
1135         /*
1136          *      Is it even present?
1137          */
1138         if (!netif_device_present(dev))
1139                 return -ENODEV;
1140
1141         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1142         ret = notifier_to_errno(ret);
1143         if (ret)
1144                 return ret;
1145
1146         /*
1147          *      Call device private open method
1148          */
1149         set_bit(__LINK_STATE_START, &dev->state);
1150
1151         if (ops->ndo_validate_addr)
1152                 ret = ops->ndo_validate_addr(dev);
1153
1154         if (!ret && ops->ndo_open)
1155                 ret = ops->ndo_open(dev);
1156
1157         /*
1158          *      If it went open OK then:
1159          */
1160
1161         if (ret)
1162                 clear_bit(__LINK_STATE_START, &dev->state);
1163         else {
1164                 /*
1165                  *      Set the flags.
1166                  */
1167                 dev->flags |= IFF_UP;
1168
1169                 /*
1170                  *      Enable NET_DMA
1171                  */
1172                 net_dmaengine_get();
1173
1174                 /*
1175                  *      Initialize multicasting status
1176                  */
1177                 dev_set_rx_mode(dev);
1178
1179                 /*
1180                  *      Wakeup transmit queue engine
1181                  */
1182                 dev_activate(dev);
1183         }
1184
1185         return ret;
1186 }
1187
1188 /**
1189  *      dev_open        - prepare an interface for use.
1190  *      @dev:   device to open
1191  *
1192  *      Takes a device from down to up state. The device's private open
1193  *      function is invoked and then the multicast lists are loaded. Finally
1194  *      the device is moved into the up state and a %NETDEV_UP message is
1195  *      sent to the netdev notifier chain.
1196  *
1197  *      Calling this function on an active interface is a nop. On a failure
1198  *      a negative errno code is returned.
1199  */
1200 int dev_open(struct net_device *dev)
1201 {
1202         int ret;
1203
1204         /*
1205          *      Is it already up?
1206          */
1207         if (dev->flags & IFF_UP)
1208                 return 0;
1209
1210         /*
1211          *      Open device
1212          */
1213         ret = __dev_open(dev);
1214         if (ret < 0)
1215                 return ret;
1216
1217         /*
1218          *      ... and announce new interface.
1219          */
1220         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1221         call_netdevice_notifiers(NETDEV_UP, dev);
1222
1223         return ret;
1224 }
1225 EXPORT_SYMBOL(dev_open);
1226
/*
 * Take every device on @head (linked through unreg_list) down in three
 * passes: announce NETDEV_GOING_DOWN and clear __LINK_STATE_START for each
 * device, deactivate them all with dev_deactivate_many(), then call each
 * driver's ndo_stop, clear IFF_UP and call net_dmaengine_put().
 * Asserts RTNL and may sleep. Always returns 0.
 */
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, unreg_list) {
		/*
		 *	Tell people we are going down, so that they can
		 *	prepare for death while the device is still operating.
		 */
		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_clear_bit(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, unreg_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		/*
		 *	Device is now down.
		 */

		dev->flags &= ~IFF_UP;

		/*
		 *	Shutdown NET_DMA
		 */
		net_dmaengine_put();
	}

	return 0;
}
1281
1282 static int __dev_close(struct net_device *dev)
1283 {
1284         LIST_HEAD(single);
1285
1286         list_add(&dev->unreg_list, &single);
1287         return __dev_close_many(&single);
1288 }
1289
1290 static int dev_close_many(struct list_head *head)
1291 {
1292         struct net_device *dev, *tmp;
1293         LIST_HEAD(tmp_list);
1294
1295         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1296                 if (!(dev->flags & IFF_UP))
1297                         list_move(&dev->unreg_list, &tmp_list);
1298
1299         __dev_close_many(head);
1300
1301         /*
1302          * Tell people we are down
1303          */
1304         list_for_each_entry(dev, head, unreg_list) {
1305                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1306                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1307         }
1308
1309         /* rollback_registered_many needs the complete original list */
1310         list_splice(&tmp_list, head);
1311         return 0;
1312 }
1313
1314 /**
1315  *      dev_close - shutdown an interface.
1316  *      @dev: device to shutdown
1317  *
1318  *      This function moves an active device into down state. A
1319  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1320  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1321  *      chain.
1322  */
1323 int dev_close(struct net_device *dev)
1324 {
1325         LIST_HEAD(single);
1326
1327         list_add(&dev->unreg_list, &single);
1328         dev_close_many(&single);
1329
1330         return 0;
1331 }
1332 EXPORT_SYMBOL(dev_close);
1333
1334
1335 /**
1336  *      dev_disable_lro - disable Large Receive Offload on a device
1337  *      @dev: device
1338  *
1339  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1340  *      called under RTNL.  This is needed if received packets may be
1341  *      forwarded to another interface.
1342  */
1343 void dev_disable_lro(struct net_device *dev)
1344 {
1345         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1346             dev->ethtool_ops->set_flags) {
1347                 u32 flags = dev->ethtool_ops->get_flags(dev);
1348                 if (flags & ETH_FLAG_LRO) {
1349                         flags &= ~ETH_FLAG_LRO;
1350                         dev->ethtool_ops->set_flags(dev, flags);
1351                 }
1352         }
1353         WARN_ON(dev->features & NETIF_F_LRO);
1354 }
1355 EXPORT_SYMBOL(dev_disable_lro);
1356
1357
/*
 * Starts out non-zero. While it is set, register_netdevice_notifier()
 * adds the notifier to the chain but does not replay events to it.
 */
static int dev_boot_phase = 1;
1359
1360 /*
1361  *      Device change register/unregister. These are not inline or static
1362  *      as we export them to the world.
1363  */
1364
1365 /**
1366  *      register_netdevice_notifier - register a network notifier block
1367  *      @nb: notifier
1368  *
1369  *      Register a notifier to be called when network device events occur.
1370  *      The notifier passed is linked into the kernel structures and must
1371  *      not be reused until it has been unregistered. A negative errno code
1372  *      is returned on a failure.
1373  *
1374  *      When registered all registration and up events are replayed
1375  *      to the new notifier to allow device to have a race free
1376  *      view of the network device list.
1377  */
1378
1379 int register_netdevice_notifier(struct notifier_block *nb)
1380 {
1381         struct net_device *dev;
1382         struct net_device *last;
1383         struct net *net;
1384         int err;
1385
1386         rtnl_lock();
1387         err = raw_notifier_chain_register(&netdev_chain, nb);
1388         if (err)
1389                 goto unlock;
1390         if (dev_boot_phase)
1391                 goto unlock;
1392         for_each_net(net) {
1393                 for_each_netdev(net, dev) {
1394                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1395                         err = notifier_to_errno(err);
1396                         if (err)
1397                                 goto rollback;
1398
1399                         if (!(dev->flags & IFF_UP))
1400                                 continue;
1401
1402                         nb->notifier_call(nb, NETDEV_UP, dev);
1403                 }
1404         }
1405
1406 unlock:
1407         rtnl_unlock();
1408         return err;
1409
1410 rollback:
1411         last = dev;
1412         for_each_net(net) {
1413                 for_each_netdev(net, dev) {
1414                         if (dev == last)
1415                                 break;
1416
1417                         if (dev->flags & IFF_UP) {
1418                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1419                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1420                         }
1421                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1422                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1423                 }
1424         }
1425
1426         raw_notifier_chain_unregister(&netdev_chain, nb);
1427         goto unlock;
1428 }
1429 EXPORT_SYMBOL(register_netdevice_notifier);
1430
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	The chain is modified with the RTNL lock taken and released here.
 */
int unregister_netdevice_notifier(struct notifier_block *nb)
{
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
1451
/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain(). Asserts that RTNL is held.
 */
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	ASSERT_RTNL();
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}
1466
/*
 * When > 0 there are consumers of rx skb time stamps.
 * Incremented by net_enable_timestamp() and decremented by
 * net_disable_timestamp().
 */
static atomic_t netstamp_needed = ATOMIC_INIT(0);
1469
/* Register one more consumer of skb time stamps (bumps netstamp_needed). */
void net_enable_timestamp(void)
{
	atomic_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
1475
/* Drop one consumer of skb time stamps (decrements netstamp_needed). */
void net_disable_timestamp(void)
{
	atomic_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
1481
1482 static inline void net_timestamp_set(struct sk_buff *skb)
1483 {
1484         if (atomic_read(&netstamp_needed))
1485                 __net_timestamp(skb);
1486         else
1487                 skb->tstamp.tv64 = 0;
1488 }
1489
1490 static inline void net_timestamp_check(struct sk_buff *skb)
1491 {
1492         if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1493                 __net_timestamp(skb);
1494 }
1495
1496 /**
1497  * dev_forward_skb - loopback an skb to another netif
1498  *
1499  * @dev: destination network device
1500  * @skb: buffer to forward
1501  *
1502  * return values:
1503  *      NET_RX_SUCCESS  (no congestion)
1504  *      NET_RX_DROP     (packet was dropped, but freed)
1505  *
1506  * dev_forward_skb can be used for injecting an skb from the
1507  * start_xmit function of one device into the receive queue
1508  * of another device.
1509  *
1510  * The receiving device may be in another namespace, so
1511  * we have to clear all information in the skb that could
1512  * impact namespace isolation.
1513  */
1514 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1515 {
1516         skb_orphan(skb);
1517         nf_reset(skb);
1518
1519         if (unlikely(!(dev->flags & IFF_UP) ||
1520                      (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1521                 atomic_long_inc(&dev->rx_dropped);
1522                 kfree_skb(skb);
1523                 return NET_RX_DROP;
1524         }
1525         skb_set_dev(skb, dev);
1526         skb->tstamp.tv64 = 0;
1527         skb->pkt_type = PACKET_HOST;
1528         skb->protocol = eth_type_trans(skb, dev);
1529         return netif_rx(skb);
1530 }
1531 EXPORT_SYMBOL_GPL(dev_forward_skb);
1532
/*
 * Hand @skb to the handler of @pt_prev and return the handler's result.
 * skb->users is incremented first, so the caller still holds its own
 * reference after the call.
 */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
1540
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 *
 *	The frame is cloned once, when the first matching tap is found.
 *	Delivery to a tap is deferred until the next match (or the end of
 *	the list): earlier taps get the clone through deliver_skb(), which
 *	increments skb->users, and the last tap is called directly without
 *	that increment.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (ptype->af_packet_priv == NULL ||
		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
			/* Clone already prepared: flush the previous tap, remember this one. */
			if (pt_prev) {
				deliver_skb(skb2, pt_prev, skb->dev);
				pt_prev = ptype;
				continue;
			}

			/* First match: make the clone. Give up quietly if that fails. */
			skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			net_timestamp_set(skb2);

			/* skb->nh should be correctly
			   set by sender, so that the second statement is
			   just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb2->network_header > skb2->tail) {
				if (net_ratelimit())
					printk(KERN_CRIT "protocol %04x is "
					       "buggy, dev %s\n",
					       ntohs(skb2->protocol),
					       dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			pt_prev = ptype;
		}
	}
	/* The last matching tap is called without an extra skb->users increment. */
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
1597
1598 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1599  * @dev: Network device
1600  * @txq: number of queues available
1601  *
1602  * If real_num_tx_queues is changed the tc mappings may no longer be
1603  * valid. To resolve this verify the tc mapping remains valid and if
1604  * not NULL the mapping. With no priorities mapping to this
1605  * offset/count pair it will no longer be used. In the worst case TC0
1606  * is invalid nothing can be done so disable priority mappings. If is
1607  * expected that drivers will fix this mapping if they can before
1608  * calling netif_set_real_num_tx_queues.
1609  */
1610 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1611 {
1612         int i;
1613         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1614
1615         /* If TC0 is invalidated disable TC mapping */
1616         if (tc->offset + tc->count > txq) {
1617                 pr_warning("Number of in use tx queues changed "
1618                            "invalidating tc mappings. Priority "
1619                            "traffic classification disabled!\n");
1620                 dev->num_tc = 0;
1621                 return;
1622         }
1623
1624         /* Invalidated prio to tc mappings set to TC0 */
1625         for (i = 1; i < TC_BITMASK + 1; i++) {
1626                 int q = netdev_get_prio_tc_map(dev, i);
1627
1628                 tc = &dev->tc_to_txq[q];
1629                 if (tc->offset + tc->count > txq) {
1630                         pr_warning("Number of in use tx queues "
1631                                    "changed. Priority %i to tc "
1632                                    "mapping %i is no longer valid "
1633                                    "setting map to 0\n",
1634                                    i, q);
1635                         netdev_set_prio_tc_map(dev, i, 0);
1636                 }
1637         }
1638 }
1639
1640 /*
1641  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1642  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1643  */
1644 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1645 {
1646         int rc;
1647
1648         if (txq < 1 || txq > dev->num_tx_queues)
1649                 return -EINVAL;
1650
1651         if (dev->reg_state == NETREG_REGISTERED) {
1652                 ASSERT_RTNL();
1653
1654                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1655                                                   txq);
1656                 if (rc)
1657                         return rc;
1658
1659                 if (dev->num_tc)
1660                         netif_setup_tc(dev, txq);
1661
1662                 if (txq < dev->real_num_tx_queues)
1663                         qdisc_reset_all_tx_gt(dev, txq);
1664         }
1665
1666         dev->real_num_tx_queues = txq;
1667         return 0;
1668 }
1669 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1670
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code: -EINVAL unless 1 <= @rxq <= dev->num_rx_queues,
 *	otherwise the error from updating the RX queue kobjects of a
 *	registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	/* Only a registered device has kobjects that need updating. */
	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1703
/*
 * Append @q to the tail of this CPU's softnet output queue and raise
 * NET_TX_SOFTIRQ. Local interrupts are disabled around the update.
 */
static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = &__get_cpu_var(softnet_data);
	/* @q becomes the new last element: terminate it, link it, move the tail. */
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
1717
1718 void __netif_schedule(struct Qdisc *q)
1719 {
1720         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1721                 __netif_reschedule(q);
1722 }
1723 EXPORT_SYMBOL(__netif_schedule);
1724
1725 void dev_kfree_skb_irq(struct sk_buff *skb)
1726 {
1727         if (atomic_dec_and_test(&skb->users)) {
1728                 struct softnet_data *sd;
1729                 unsigned long flags;
1730
1731                 local_irq_save(flags);
1732                 sd = &__get_cpu_var(softnet_data);
1733                 skb->next = sd->completion_queue;
1734                 sd->completion_queue = skb;
1735                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1736                 local_irq_restore(flags);
1737         }
1738 }
1739 EXPORT_SYMBOL(dev_kfree_skb_irq);
1740
/*
 * Free @skb from any context: use dev_kfree_skb_irq() when running in
 * hard interrupt context or with interrupts disabled, and dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1749
1750
1751 /**
1752  * netif_device_detach - mark device as removed
1753  * @dev: network device
1754  *
1755  * Mark device as removed from system and therefore no longer available.
1756  */
1757 void netif_device_detach(struct net_device *dev)
1758 {
1759         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1760             netif_running(dev)) {
1761                 netif_tx_stop_all_queues(dev);
1762         }
1763 }
1764 EXPORT_SYMBOL(netif_device_detach);
1765
1766 /**
1767  * netif_device_attach - mark device as attached
1768  * @dev: network device
1769  *
1770  * Mark device as attached from system and restart if needed.
1771  */
1772 void netif_device_attach(struct net_device *dev)
1773 {
1774         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1775             netif_running(dev)) {
1776                 netif_tx_wake_all_queues(dev);
1777                 __netdev_watchdog_up(dev);
1778         }
1779 }
1780 EXPORT_SYMBOL(netif_device_attach);
1781
/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The dst reference held by @skb is always dropped.  If @skb is already
 * owned by a device that lives in a different network namespace than
 * @dev, all data private to the old namespace (secpath, netfilter state,
 * secmark, mark, priority, nf_trace, ipvs_property and, with
 * CONFIG_NET_SCHED, tc_index) is reset before @dev is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
        skb_dst_drop(skb);

        /* Crossing a namespace boundary: scrub per-namespace state. */
        if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
                secpath_reset(skb);
                nf_reset(skb);
                skb_init_secmark(skb);
                skb->mark = 0;
                skb->priority = 0;
                skb->nf_trace = 0;
                skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
                skb->tc_index = 0;
#endif
        }
        skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1811
1812 /*
1813  * Invalidate hardware checksum when packet is to be mangled, and
1814  * complete checksum manually on outgoing path.
1815  */
1816 int skb_checksum_help(struct sk_buff *skb)
1817 {
1818         __wsum csum;
1819         int ret = 0, offset;
1820
1821         if (skb->ip_summed == CHECKSUM_COMPLETE)
1822                 goto out_set_summed;
1823
1824         if (unlikely(skb_shinfo(skb)->gso_size)) {
1825                 /* Let GSO fix up the checksum. */
1826                 goto out_set_summed;
1827         }
1828
1829         offset = skb_checksum_start_offset(skb);
1830         BUG_ON(offset >= skb_headlen(skb));
1831         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1832
1833         offset += skb->csum_offset;
1834         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1835
1836         if (skb_cloned(skb) &&
1837             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1838                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1839                 if (ret)
1840                         goto out;
1841         }
1842
1843         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1844 out_set_summed:
1845         skb->ip_summed = CHECKSUM_NONE;
1846 out:
1847         return ret;
1848 }
1849 EXPORT_SYMBOL(skb_checksum_help);
1850
1851 /**
1852  *      skb_gso_segment - Perform segmentation on skb.
1853  *      @skb: buffer to segment
1854  *      @features: features for the output path (see dev->features)
1855  *
1856  *      This function segments the given skb and returns a list of segments.
1857  *
1858  *      It may return NULL if the skb requires no segmentation.  This is
1859  *      only possible when GSO is used for verifying header integrity.
1860  */
1861 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1862 {
1863         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1864         struct packet_type *ptype;
1865         __be16 type = skb->protocol;
1866         int vlan_depth = ETH_HLEN;
1867         int err;
1868
1869         while (type == htons(ETH_P_8021Q)) {
1870                 struct vlan_hdr *vh;
1871
1872                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1873                         return ERR_PTR(-EINVAL);
1874
1875                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1876                 type = vh->h_vlan_encapsulated_proto;
1877                 vlan_depth += VLAN_HLEN;
1878         }
1879
1880         skb_reset_mac_header(skb);
1881         skb->mac_len = skb->network_header - skb->mac_header;
1882         __skb_pull(skb, skb->mac_len);
1883
1884         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1885                 struct net_device *dev = skb->dev;
1886                 struct ethtool_drvinfo info = {};
1887
1888                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1889                         dev->ethtool_ops->get_drvinfo(dev, &info);
1890
1891                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1892                      info.driver, dev ? dev->features : 0L,
1893                      skb->sk ? skb->sk->sk_route_caps : 0L,
1894                      skb->len, skb->data_len, skb->ip_summed);
1895
1896                 if (skb_header_cloned(skb) &&
1897                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1898                         return ERR_PTR(err);
1899         }
1900
1901         rcu_read_lock();
1902         list_for_each_entry_rcu(ptype,
1903                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1904                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1905                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1906                                 err = ptype->gso_send_check(skb);
1907                                 segs = ERR_PTR(err);
1908                                 if (err || skb_gso_ok(skb, features))
1909                                         break;
1910                                 __skb_push(skb, (skb->data -
1911                                                  skb_network_header(skb)));
1912                         }
1913                         segs = ptype->gso_segment(skb, features);
1914                         break;
1915                 }
1916         }
1917         rcu_read_unlock();
1918
1919         __skb_push(skb, skb->data - skb_mac_header(skb));
1920
1921         return segs;
1922 }
1923 EXPORT_SYMBOL(skb_gso_segment);
1924
/*
 * Take action when hardware reception checksum errors are detected:
 * log a rate-limited error naming the device (if any) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        const char *name;

        if (!net_ratelimit())
                return;

        name = dev ? dev->name : "<unknown>";
        printk(KERN_ERR "%s: hw csum failure.\n", name);
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1937
/*
 * Returns 1 when @skb has a page fragment that @dev cannot DMA from,
 * 0 otherwise.  Without CONFIG_HIGHMEM this always returns 0.
 *
 * Actually, we should eliminate this check as soon as we know that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        int frag;

        /* Device cannot touch highmem: any highmem fragment is illegal. */
        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (frag = 0; frag < shinfo->nr_frags; frag++) {
                        if (PageHighMem(shinfo->frags[frag].page))
                                return 1;
                }
        }

        if (PCI_DMA_BUS_IS_PHYS) {
                struct device *parent = dev->dev.parent;

                if (!parent)
                        return 0;

                /* Each fragment page must lie below the parent's DMA mask. */
                for (frag = 0; frag < shinfo->nr_frags; frag++) {
                        dma_addr_t addr =
                                page_to_phys(shinfo->frags[frag].page);

                        if (!parent->dma_mask ||
                            addr + PAGE_SIZE - 1 > *parent->dma_mask)
                                return 1;
                }
        }
#endif
        return 0;
}
1967
/*
 * Per-skb state kept in skb->cb while software GSO is in progress:
 * the destructor that was installed on the skb before
 * dev_gso_skb_destructor replaced it.
 */
struct dev_gso_cb {
        void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1973
1974 static void dev_gso_skb_destructor(struct sk_buff *skb)
1975 {
1976         struct dev_gso_cb *cb;
1977
1978         do {
1979                 struct sk_buff *nskb = skb->next;
1980
1981                 skb->next = nskb->next;
1982                 nskb->next = NULL;
1983                 kfree_skb(nskb);
1984         } while (skb->next);
1985
1986         cb = DEV_GSO_CB(skb);
1987         if (cb->destructor)
1988                 cb->destructor(skb);
1989 }
1990
1991 /**
1992  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1993  *      @skb: buffer to segment
1994  *      @features: device features as applicable to this skb
1995  *
1996  *      This function segments the given skb and stores the list of segments
1997  *      in skb->next.
1998  */
1999 static int dev_gso_segment(struct sk_buff *skb, int features)
2000 {
2001         struct sk_buff *segs;
2002
2003         segs = skb_gso_segment(skb, features);
2004
2005         /* Verifying header integrity only. */
2006         if (!segs)
2007                 return 0;
2008
2009         if (IS_ERR(segs))
2010                 return PTR_ERR(segs);
2011
2012         skb->next = segs;
2013         DEV_GSO_CB(skb)->destructor = skb->destructor;
2014         skb->destructor = dev_gso_skb_destructor;
2015
2016         return 0;
2017 }
2018
2019 /*
2020  * Try to orphan skb early, right before transmission by the device.
2021  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2022  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2023  */
2024 static inline void skb_orphan_try(struct sk_buff *skb)
2025 {
2026         struct sock *sk = skb->sk;
2027
2028         if (sk && !skb_shinfo(skb)->tx_flags) {
2029                 /* skb_tx_hash() wont be able to get sk.
2030                  * We copy sk_hash into skb->rxhash
2031                  */
2032                 if (!skb->rxhash)
2033                         skb->rxhash = sk->sk_hash;
2034                 skb_orphan(skb);
2035         }
2036 }
2037
2038 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2039 {
2040         return ((features & NETIF_F_GEN_CSUM) ||
2041                 ((features & NETIF_F_V4_CSUM) &&
2042                  protocol == htons(ETH_P_IP)) ||
2043                 ((features & NETIF_F_V6_CSUM) &&
2044                  protocol == htons(ETH_P_IPV6)) ||
2045                 ((features & NETIF_F_FCOE_CRC) &&
2046                  protocol == htons(ETH_P_FCOE)));
2047 }
2048
2049 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2050 {
2051         if (!can_checksum_protocol(features, protocol)) {
2052                 features &= ~NETIF_F_ALL_CSUM;
2053                 features &= ~NETIF_F_SG;
2054         } else if (illegal_highdma(skb->dev, skb)) {
2055                 features &= ~NETIF_F_SG;
2056         }
2057
2058         return features;
2059 }
2060
2061 u32 netif_skb_features(struct sk_buff *skb)
2062 {
2063         __be16 protocol = skb->protocol;
2064         u32 features = skb->dev->features;
2065
2066         if (protocol == htons(ETH_P_8021Q)) {
2067                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2068                 protocol = veh->h_vlan_encapsulated_proto;
2069         } else if (!vlan_tx_tag_present(skb)) {
2070                 return harmonize_features(skb, protocol, features);
2071         }
2072
2073         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2074
2075         if (protocol != htons(ETH_P_8021Q)) {
2076                 return harmonize_features(skb, protocol, features);
2077         } else {
2078                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2079                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2080                 return harmonize_features(skb, protocol, features);
2081         }
2082 }
2083 EXPORT_SYMBOL(netif_skb_features);
2084
2085 /*
2086  * Returns true if either:
2087  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2088  *      2. skb is fragmented and the device does not support SG, or if
2089  *         at least one of fragments is in highmem and device does not
2090  *         support DMA from it.
2091  */
2092 static inline int skb_needs_linearize(struct sk_buff *skb,
2093                                       int features)
2094 {
2095         return skb_is_nonlinear(skb) &&
2096                         ((skb_has_frag_list(skb) &&
2097                                 !(features & NETIF_F_FRAGLIST)) ||
2098                         (skb_shinfo(skb)->nr_frags &&
2099                                 !(features & NETIF_F_SG)));
2100 }
2101
2102 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2103                         struct netdev_queue *txq)
2104 {
2105         const struct net_device_ops *ops = dev->netdev_ops;
2106         int rc = NETDEV_TX_OK;
2107
2108         if (likely(!skb->next)) {
2109                 u32 features;
2110
2111                 /*
2112                  * If device doesnt need skb->dst, release it right now while
2113                  * its hot in this cpu cache
2114                  */
2115                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2116                         skb_dst_drop(skb);
2117
2118                 if (!list_empty(&ptype_all))
2119                         dev_queue_xmit_nit(skb, dev);
2120
2121                 skb_orphan_try(skb);
2122
2123                 features = netif_skb_features(skb);
2124
2125                 if (vlan_tx_tag_present(skb) &&
2126                     !(features & NETIF_F_HW_VLAN_TX)) {
2127                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2128                         if (unlikely(!skb))
2129                                 goto out;
2130
2131                         skb->vlan_tci = 0;
2132                 }
2133
2134                 if (netif_needs_gso(skb, features)) {
2135                         if (unlikely(dev_gso_segment(skb, features)))
2136                                 goto out_kfree_skb;
2137                         if (skb->next)
2138                                 goto gso;
2139                 } else {
2140                         if (skb_needs_linearize(skb, features) &&
2141                             __skb_linearize(skb))
2142                                 goto out_kfree_skb;
2143
2144                         /* If packet is not checksummed and device does not
2145                          * support checksumming for this protocol, complete
2146                          * checksumming here.
2147                          */
2148                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2149                                 skb_set_transport_header(skb,
2150                                         skb_checksum_start_offset(skb));
2151                                 if (!(features & NETIF_F_ALL_CSUM) &&
2152                                      skb_checksum_help(skb))
2153                                         goto out_kfree_skb;
2154                         }
2155                 }
2156
2157                 rc = ops->ndo_start_xmit(skb, dev);
2158                 trace_net_dev_xmit(skb, rc);
2159                 if (rc == NETDEV_TX_OK)
2160                         txq_trans_update(txq);
2161                 return rc;
2162         }
2163
2164 gso:
2165         do {
2166                 struct sk_buff *nskb = skb->next;
2167
2168                 skb->next = nskb->next;
2169                 nskb->next = NULL;
2170
2171                 /*
2172                  * If device doesnt need nskb->dst, release it right now while
2173                  * its hot in this cpu cache
2174                  */
2175                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2176                         skb_dst_drop(nskb);
2177
2178                 rc = ops->ndo_start_xmit(nskb, dev);
2179                 trace_net_dev_xmit(nskb, rc);
2180                 if (unlikely(rc != NETDEV_TX_OK)) {
2181                         if (rc & ~NETDEV_TX_MASK)
2182                                 goto out_kfree_gso_skb;
2183                         nskb->next = skb->next;
2184                         skb->next = nskb;
2185                         return rc;
2186                 }
2187                 txq_trans_update(txq);
2188                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2189                         return NETDEV_TX_BUSY;
2190         } while (skb->next);
2191
2192 out_kfree_gso_skb:
2193         if (likely(skb->next == NULL))
2194                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2195 out_kfree_skb:
2196         kfree_skb(skb);
2197 out:
2198         return rc;
2199 }
2200
2201 static u32 hashrnd __read_mostly;
2202
2203 /*
2204  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2205  * to be used as a distribution range.
2206  */
2207 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2208                   unsigned int num_tx_queues)
2209 {
2210         u32 hash;
2211         u16 qoffset = 0;
2212         u16 qcount = num_tx_queues;
2213
2214         if (skb_rx_queue_recorded(skb)) {
2215                 hash = skb_get_rx_queue(skb);
2216                 while (unlikely(hash >= num_tx_queues))
2217                         hash -= num_tx_queues;
2218                 return hash;
2219         }
2220
2221         if (dev->num_tc) {
2222                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2223                 qoffset = dev->tc_to_txq[tc].offset;
2224                 qcount = dev->tc_to_txq[tc].count;
2225         }
2226
2227         if (skb->sk && skb->sk->sk_hash)
2228                 hash = skb->sk->sk_hash;
2229         else
2230                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2231         hash = jhash_1word(hash, hashrnd);
2232
2233         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2234 }
2235 EXPORT_SYMBOL(__skb_tx_hash);
2236
2237 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2238 {
2239         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2240                 if (net_ratelimit()) {
2241                         pr_warning("%s selects TX queue %d, but "
2242                                 "real number of TX queues is %d\n",
2243                                 dev->name, queue_index, dev->real_num_tx_queues);
2244                 }
2245                 return 0;
2246         }
2247         return queue_index;
2248 }
2249
/*
 * Look up a TX queue for @skb in the XPS map of the current CPU.
 * Returns the queue index, or -1 when CONFIG_XPS is off, when no map
 * exists for this CPU, or when the mapped queue is not below
 * real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *dev_maps;
        struct xps_map *map;
        int queue_index = -1;

        rcu_read_lock();

        dev_maps = rcu_dereference(dev->xps_maps);
        if (!dev_maps)
                goto unlock;

        map = rcu_dereference(dev_maps->cpu_map[raw_smp_processor_id()]);
        if (!map)
                goto unlock;

        if (map->len == 1) {
                queue_index = map->queues[0];
        } else {
                /* Several candidates: pick one by hashing the flow. */
                u32 hash;

                if (skb->sk && skb->sk->sk_hash)
                        hash = skb->sk->sk_hash;
                else
                        hash = (__force u16) skb->protocol ^
                            skb->rxhash;
                hash = jhash_1word(hash, hashrnd);
                queue_index = map->queues[
                    ((u64)hash * map->len) >> 32];
        }

        if (unlikely(queue_index >= dev->real_num_tx_queues))
                queue_index = -1;

unlock:
        rcu_read_unlock();

        return queue_index;
#else
        return -1;
#endif
}
2287
2288 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2289                                         struct sk_buff *skb)
2290 {
2291         int queue_index;
2292         const struct net_device_ops *ops = dev->netdev_ops;
2293
2294         if (dev->real_num_tx_queues == 1)
2295                 queue_index = 0;
2296         else if (ops->ndo_select_queue) {
2297                 queue_index = ops->ndo_select_queue(dev, skb);
2298                 queue_index = dev_cap_txqueue(dev, queue_index);
2299         } else {
2300                 struct sock *sk = skb->sk;
2301                 queue_index = sk_tx_queue_get(sk);
2302
2303                 if (queue_index < 0 || skb->ooo_okay ||
2304                     queue_index >= dev->real_num_tx_queues) {
2305                         int old_index = queue_index;
2306
2307                         queue_index = get_xps_queue(dev, skb);
2308                         if (queue_index < 0)
2309                                 queue_index = skb_tx_hash(dev, skb);
2310
2311                         if (queue_index != old_index && sk) {
2312                                 struct dst_entry *dst =
2313                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2314
2315                                 if (dst && skb_dst(skb) == dst)
2316                                         sk_tx_queue_set(sk, queue_index);
2317                         }
2318                 }
2319         }
2320
2321         skb_set_queue_mapping(skb, queue_index);
2322         return netdev_get_tx_queue(dev, queue_index);
2323 }
2324
2325 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2326                                  struct net_device *dev,
2327                                  struct netdev_queue *txq)
2328 {
2329         spinlock_t *root_lock = qdisc_lock(q);
2330         bool contended;
2331         int rc;
2332
2333         qdisc_skb_cb(skb)->pkt_len = skb->len;
2334         qdisc_calculate_pkt_len(skb, q);
2335         /*
2336          * Heuristic to force contended enqueues to serialize on a
2337          * separate lock before trying to get qdisc main lock.
2338          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2339          * and dequeue packets faster.
2340          */
2341         contended = qdisc_is_running(q);
2342         if (unlikely(contended))
2343                 spin_lock(&q->busylock);
2344
2345         spin_lock(root_lock);
2346         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2347                 kfree_skb(skb);
2348                 rc = NET_XMIT_DROP;
2349         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2350                    qdisc_run_begin(q)) {
2351                 /*
2352                  * This is a work-conserving queue; there are no old skbs
2353                  * waiting to be sent out; and the qdisc is not running -
2354                  * xmit the skb directly.
2355                  */
2356                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2357                         skb_dst_force(skb);
2358
2359                 qdisc_bstats_update(q, skb);
2360
2361                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2362                         if (unlikely(contended)) {
2363                                 spin_unlock(&q->busylock);
2364                                 contended = false;
2365                         }
2366                         __qdisc_run(q);
2367                 } else
2368                         qdisc_run_end(q);
2369
2370                 rc = NET_XMIT_SUCCESS;
2371         } else {
2372                 skb_dst_force(skb);
2373                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2374                 if (qdisc_run_begin(q)) {
2375                         if (unlikely(contended)) {
2376                                 spin_unlock(&q->busylock);
2377                                 contended = false;
2378                         }
2379                         __qdisc_run(q);
2380                 }
2381         }
2382         spin_unlock(root_lock);
2383         if (unlikely(contended))
2384                 spin_unlock(&q->busylock);
2385         return rc;
2386 }
2387
2388 static DEFINE_PER_CPU(int, xmit_recursion);
2389 #define RECURSION_LIMIT 10
2390
2391 /**
2392  *      dev_queue_xmit - transmit a buffer
2393  *      @skb: buffer to transmit
2394  *
2395  *      Queue a buffer for transmission to a network device. The caller must
2396  *      have set the device and priority and built the buffer before calling
2397  *      this function. The function can be called from an interrupt.
2398  *
2399  *      A negative errno code is returned on a failure. A success does not
2400  *      guarantee the frame will be transmitted as it may be dropped due
2401  *      to congestion or traffic shaping.
2402  *
2403  * -----------------------------------------------------------------------------------
2404  *      I notice this method can also return errors from the queue disciplines,
2405  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2406  *      be positive.
2407  *
2408  *      Regardless of the return value, the skb is consumed, so it is currently
2409  *      difficult to retry a send to this method.  (You can bump the ref count
2410  *      before sending to hold a reference for retry if you are careful.)
2411  *
2412  *      When calling this method, interrupts MUST be enabled.  This is because
2413  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2414  *          --BLG
2415  */
2416 int dev_queue_xmit(struct sk_buff *skb)
2417 {
2418         struct net_device *dev = skb->dev;
2419         struct netdev_queue *txq;
2420         struct Qdisc *q;
2421         int rc = -ENOMEM;
2422
2423         /* Disable soft irqs for various locks below. Also
2424          * stops preemption for RCU.
2425          */
2426         rcu_read_lock_bh();
2427
2428         txq = dev_pick_tx(dev, skb);
2429         q = rcu_dereference_bh(txq->qdisc);
2430
2431 #ifdef CONFIG_NET_CLS_ACT
2432         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2433 #endif
2434         trace_net_dev_queue(skb);
2435         if (q->enqueue) {
2436                 rc = __dev_xmit_skb(skb, q, dev, txq);
2437                 goto out;
2438         }
2439
2440         /* The device has no queue. Common case for software devices:
2441            loopback, all the sorts of tunnels...
2442
2443            Really, it is unlikely that netif_tx_lock protection is necessary
2444            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2445            counters.)
2446            However, it is possible, that they rely on protection
2447            made by us here.
2448
2449            Check this and shot the lock. It is not prone from deadlocks.
2450            Either shot noqueue qdisc, it is even simpler 8)
2451          */
2452         if (dev->flags & IFF_UP) {
2453                 int cpu = smp_processor_id(); /* ok because BHs are off */
2454
2455                 if (txq->xmit_lock_owner != cpu) {
2456
2457                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2458                                 goto recursion_alert;
2459
2460                         HARD_TX_LOCK(dev, txq, cpu);
2461
2462                         if (!netif_tx_queue_stopped(txq)) {
2463                                 __this_cpu_inc(xmit_recursion);
2464                                 rc = dev_hard_start_xmit(skb, dev, txq);
2465                                 __this_cpu_dec(xmit_recursion);
2466                                 if (dev_xmit_complete(rc)) {
2467                                         HARD_TX_UNLOCK(dev, txq);
2468                                         goto out;
2469                                 }
2470                         }
2471                         HARD_TX_UNLOCK(dev, txq);
2472                         if (net_ratelimit())
2473                                 printk(KERN_CRIT "Virtual device %s asks to "
2474                                        "queue packet!\n", dev->name);
2475                 } else {
2476                         /* Recursion is detected! It is possible,
2477                          * unfortunately
2478                          */
2479 recursion_alert:
2480                         if (net_ratelimit())
2481                                 printk(KERN_CRIT "Dead loop on virtual device "
2482                                        "%s, fix it urgently!\n", dev->name);
2483                 }
2484         }
2485
2486         rc = -ENETDOWN;
2487         rcu_read_unlock_bh();
2488
2489         kfree_skb(skb);
2490         return rc;
2491 out:
2492         rcu_read_unlock_bh();
2493         return rc;
2494 }
2495 EXPORT_SYMBOL(dev_queue_xmit);
2496
2497
2498 /*=======================================================================
2499                         Receiver routines
2500   =======================================================================*/
2501
2502 int netdev_max_backlog __read_mostly = 1000;
2503 int netdev_tstamp_prequeue __read_mostly = 1;
2504 int netdev_budget __read_mostly = 300;
2505 int weight_p __read_mostly = 64;            /* old backlog weight */
2506
2507 /* Called with irq disabled */
2508 static inline void ____napi_schedule(struct softnet_data *sd,
2509                                      struct napi_struct *napi)
2510 {
2511         list_add_tail(&napi->poll_list, &sd->poll_list);
2512         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2513 }
2514
2515 /*
2516  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2517  * and src/dst port numbers. Returns a non-zero hash number on success
2518  * and 0 on failure.
      *
      * Only ETH_P_IP and ETH_P_IPV6 packets are hashed; any other protocol,
      * or a network header for which pskb_may_pull() fails, yields 0.
      * For IPv6 only the last 32-bit word of each address (s6_addr32[3])
      * goes into the hash.  The two addresses and the two ports are each
      * put into ascending order first, so both directions of a flow hash
      * to the same value.
2519  */
2520 __u32 __skb_get_rxhash(struct sk_buff *skb)
2521 {
2522         int nhoff, hash = 0, poff;
2523         struct ipv6hdr *ip6;
2524         struct iphdr *ip;
2525         u8 ip_proto;
2526         u32 addr1, addr2, ihl;
2527         union {
2528                 u32 v32;
2529                 u16 v16[2];
2530         } ports;
2531
2532         nhoff = skb_network_offset(skb);
2533
2534         switch (skb->protocol) {
2535         case __constant_htons(ETH_P_IP):
2536                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2537                         goto done;
2538
2539                 ip = (struct iphdr *) (skb->data + nhoff);
                      /* fragments are hashed as protocol 0, not as their L4 protocol */
2540                 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2541                         ip_proto = 0;
2542                 else
2543                         ip_proto = ip->protocol;
2544                 addr1 = (__force u32) ip->saddr;
2545                 addr2 = (__force u32) ip->daddr;
2546                 ihl = ip->ihl;
2547                 break;
2548         case __constant_htons(ETH_P_IPV6):
2549                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2550                         goto done;
2551
2552                 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2553                 ip_proto = ip6->nexthdr;
2554                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2555                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
                      /* 40 bytes expressed in 32-bit words; scaled back by 4 below */
2556                 ihl = (40 >> 2);
2557                 break;
2558         default:
2559                 goto done;
2560         }
2561
              /* ports stay 0 if the protocol has no port offset or the pull fails */
2562         ports.v32 = 0;
2563         poff = proto_ports_offset(ip_proto);
2564         if (poff >= 0) {
2565                 nhoff += ihl * 4 + poff;
2566                 if (pskb_may_pull(skb, nhoff + 4)) {
2567                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2568                         if (ports.v16[1] < ports.v16[0])
2569                                 swap(ports.v16[0], ports.v16[1]);
2570                 }
2571         }
2572
2573         /* get a consistent hash (same value on both flow directions) */
2574         if (addr2 < addr1)
2575                 swap(addr1, addr2);
2576
2577         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
              /* 0 is reserved for "no hash", so a computed 0 is reported as 1 */
2578         if (!hash)
2579                 hash = 1;
2580
2581 done:
2582         return hash;
2583 }
2584 EXPORT_SYMBOL(__skb_get_rxhash);
2585
2586 #ifdef CONFIG_RPS
2587
2588 /*
      * One global table that all flow-based protocols share.
      *
      * The pointer is RCU-annotated; get_rps_cpu() fetches it with
      * rcu_dereference() and indexes ents[] with the masked rxhash to find
      * the CPU a flow should be steered to.
      */
2589 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2590 EXPORT_SYMBOL(rps_sock_flow_table);
2591
     /*
      * set_rps_cpu - point a flow table entry at @next_cpu
      * @dev: receiving device
      * @skb: packet being steered
      * @rflow: flow entry selected for this packet's hash
      * @next_cpu: CPU to record in the entry (may be RPS_NO_CPU)
      *
      * Stores @next_cpu in @rflow.  With CONFIG_RFS_ACCEL, when the skb has
      * a recorded RX queue and the device has an rx_cpu_rmap, the driver is
      * also asked through ndo_rx_flow_steer() to steer the flow to the
      * hardware queue that the rmap gives for @next_cpu.  If that succeeds,
      * the entry in that queue's flow table is used from then on and keeps
      * the filter ID the driver returned.  Unless @next_cpu is RPS_NO_CPU,
      * the entry's last_qtail is set to the current input_queue_head of
      * @next_cpu.
      *
      * Returns the flow entry the caller should use from now on.
      */
2592 static struct rps_dev_flow *
2593 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2594             struct rps_dev_flow *rflow, u16 next_cpu)
2595 {
2596         u16 tcpu;
2597
2598         tcpu = rflow->cpu = next_cpu;
2599         if (tcpu != RPS_NO_CPU) {
2600 #ifdef CONFIG_RFS_ACCEL
2601                 struct netdev_rx_queue *rxqueue;
2602                 struct rps_dev_flow_table *flow_table;
2603                 struct rps_dev_flow *old_rflow;
2604                 u32 flow_id;
2605                 u16 rxq_index;
2606                 int rc;
2607
2608                 /* Should we steer this flow to a different hardware queue? */
2609                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap)
2610                         goto out;
2611                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                      /* already arriving on the queue the rmap picks: nothing to do */
2612                 if (rxq_index == skb_get_rx_queue(skb))
2613                         goto out;
2614
2615                 rxqueue = dev->_rx + rxq_index;
2616                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2617                 if (!flow_table)
2618                         goto out;
2619                 flow_id = skb->rxhash & flow_table->mask;
2620                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2621                                                         rxq_index, flow_id);
                      /* negative return: the driver did not install a filter */
2622                 if (rc < 0)
2623                         goto out;
                      /* switch to the entry in the target queue's flow table */
2624                 old_rflow = rflow;
2625                 rflow = &flow_table->flows[flow_id];
2626                 rflow->cpu = next_cpu;
2627                 rflow->filter = rc;
                      /* the old entry must not keep claiming the same filter ID */
2628                 if (old_rflow->filter == rflow->filter)
2629                         old_rflow->filter = RPS_NO_FILTER;
2630         out:
2631 #endif
2632                 rflow->last_qtail =
2633                         per_cpu(softnet_data, tcpu).input_queue_head;
2634         }
2635
2636         return rflow;
2637 }
2638
2639 /*
2640  * get_rps_cpu is called from netif_rx and netif_receive_skb and returns
2641  * the target CPU for a given skb, or -1 if the packet is not to be steered.
2642  * rcu_read_lock must be held on entry.
      *
      * @dev: receiving device
      * @skb: packet to steer; its network header is reset and its rxhash
      *       is computed on demand
      * @rflowp: set to the flow entry in use when the CPU comes from the
      *          flow tables (RFS); left untouched otherwise
      *
      * Selection order:
      *   - an RPS map with a single CPU and no flow table: that CPU, if online;
      *   - RFS: the CPU recorded in the rx-queue flow table entry, moved to
      *     the CPU wanted by the socket flow table when that is safe;
      *   - RPS: map->cpus[] indexed by the scaled rxhash, if that CPU is online.
2643  */
2644 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2645                        struct rps_dev_flow **rflowp)
2646 {
2647         struct netdev_rx_queue *rxqueue;
2648         struct rps_map *map;
2649         struct rps_dev_flow_table *flow_table;
2650         struct rps_sock_flow_table *sock_flow_table;
2651         int cpu = -1;
2652         u16 tcpu;
2653
2654         if (skb_rx_queue_recorded(skb)) {
2655                 u16 index = skb_get_rx_queue(skb);
2656                 if (unlikely(index >= dev->real_num_rx_queues)) {
2657                         WARN_ONCE(dev->real_num_rx_queues > 1,
2658                                   "%s received packet on queue %u, but number "
2659                                   "of RX queues is %u\n",
2660                                   dev->name, index, dev->real_num_rx_queues);
2661                         goto done;
2662                 }
2663                 rxqueue = dev->_rx + index;
2664         } else
2665                 rxqueue = dev->_rx;
2666
2667         map = rcu_dereference(rxqueue->rps_map);
2668         if (map) {
                      /* one CPU and no flow table: no hash needed */
2669                 if (map->len == 1 &&
2670                     !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2671                         tcpu = map->cpus[0];
2672                         if (cpu_online(tcpu))
2673                                 cpu = tcpu;
2674                         goto done;
2675                 }
2676         } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2677                 goto done;
2678         }
2679
2680         skb_reset_network_header(skb);
2681         if (!skb_get_rxhash(skb))
2682                 goto done;
2683
2684         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2685         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2686         if (flow_table && sock_flow_table) {
2687                 u16 next_cpu;
2688                 struct rps_dev_flow *rflow;
2689
2690                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2691                 tcpu = rflow->cpu;
2692
2693                 next_cpu = sock_flow_table->ents[skb->rxhash &
2694                     sock_flow_table->mask];
2695
2696                 /*
2697                  * If the desired CPU (where last recvmsg was done) is
2698                  * different from current CPU (one in the rx-queue flow
2699                  * table entry), switch if one of the following holds:
2700                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2701                  *   - Current CPU is offline.
2702                  *   - The current CPU's queue tail has advanced beyond the
2703                  *     last packet that was enqueued using this table entry.
2704                  *     This guarantees that all previous packets for the flow
2705                  *     have been dequeued, thus preserving in order delivery.
2706                  */
2707                 if (unlikely(tcpu != next_cpu) &&
2708                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2709                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2710                       rflow->last_qtail)) >= 0)) {
                              /*
                               * Follow the switch locally as well.  Otherwise this
                               * packet would still be queued on the old CPU (or
                               * bypass RFS if that was RPS_NO_CPU) while the
                               * already-switched entry's last_qtail was being
                               * updated from the wrong CPU's queue.
                               */
                              tcpu = next_cpu;
2711                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
                      }
2712
2713                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2714                         *rflowp = rflow;
2715                         cpu = tcpu;
2716                         goto done;
2717                 }
2718         }
2719
2720         if (map) {
                      /* scale the 32-bit hash onto [0, map->len) */
2721                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2722
2723                 if (cpu_online(tcpu)) {
2724                         cpu = tcpu;
2725                         goto done;
2726                 }
2727         }
2728
2729 done:
2730         return cpu;
2731 }
2732
2733 #ifdef CONFIG_RFS_ACCEL
2734
2735 /**
2736  * rps_may_expire_flow - report whether an RFS hardware filter can be dropped
2737  * @dev: device the filter was installed on
2738  * @rxq_index: index of the RX queue whose flow table is consulted
2739  * @flow_id: flow ID that was passed to ndo_rx_flow_steer()
2740  * @filter_id: filter ID that ndo_rx_flow_steer() returned
2741  *
2742  * Drivers implementing ndo_rx_flow_steer() should call this periodically
2743  * for each installed filter and remove those for which it returns %true.
2744  */
2745 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2746                          u32 flow_id, u16 filter_id)
2747 {
2748         struct netdev_rx_queue *rxq = &dev->_rx[rxq_index];
2749         struct rps_dev_flow_table *table;
2750         struct rps_dev_flow *flow;
2751         bool may_expire = true;
2752         int cpu;
2753
2754         rcu_read_lock();
2755         table = rcu_dereference(rxq->rps_flow_table);
2756         if (!table || flow_id > table->mask)
2757                 goto unlock;
2758         flow = &table->flows[flow_id];
2759         cpu = ACCESS_ONCE(flow->cpu);
2760         if (flow->filter != filter_id || cpu == RPS_NO_CPU)
2761                 goto unlock;
2762         if ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2763                   flow->last_qtail) < (int)(10 * table->mask))
2764                 may_expire = false;
2765 unlock:
2766         rcu_read_unlock();
2767         return may_expire;
2768 }
2769 EXPORT_SYMBOL(rps_may_expire_flow);
2770
2771 #endif /* CONFIG_RFS_ACCEL */
2772
2773 /*
      * Called from hardirq (IPI) context.
      *
      * @data is the struct softnet_data whose backlog NAPI gets scheduled;
      * the event is counted in its received_rps.
      */
2774 static void rps_trigger_softirq(void *data)
2775 {
2776         struct softnet_data *sd = data;
2777
2778         ____napi_schedule(sd, &sd->backlog);
2779         sd->received_rps++;
2780 }
2781
2782 #endif /* CONFIG_RPS */
2783
2784 /*
2785  * Decide whether @sd belongs to a CPU other than the one we run on.
2786  * Remote: chain @sd onto the local rps_ipi_list, raise NET_RX_SOFTIRQ
2787  * and return 1.  Local (or RPS not configured): return 0.
2788  */
2789 static int rps_ipi_queued(struct softnet_data *sd)
2790 {
2791 #ifdef CONFIG_RPS
2792         struct softnet_data *local_sd = &__get_cpu_var(softnet_data);
2793
2794         if (sd == local_sd)
2795                 return 0;
2796         sd->rps_ipi_next = local_sd->rps_ipi_list;
2797         local_sd->rps_ipi_list = sd;
2798         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2799         return 1;
2800 #else
2801         return 0;
2802 #endif /* CONFIG_RPS */
2803 }
2804
2805 /*
2806  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2807  * queue (may be a remote CPU queue).
      *
      * @skb: packet to queue
      * @cpu: CPU whose softnet_data backlog receives the packet
      * @qtail: handed to input_queue_tail_incr_save() once the skb is queued
      *
      * The queue is manipulated with local interrupts disabled and under
      * rps_lock().  Returns NET_RX_SUCCESS if the skb was queued.  If the
      * queue already holds more than netdev_max_backlog packets, the skb is
      * freed, the drop is counted in sd->dropped and in the device's
      * rx_dropped, and NET_RX_DROP is returned.
2808  */
2809 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2810                               unsigned int *qtail)
2811 {
2812         struct softnet_data *sd;
2813         unsigned long flags;
2814
2815         sd = &per_cpu(softnet_data, cpu);
2816
2817         local_irq_save(flags);
2818
2819         rps_lock(sd);
2820         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
                      /* non-empty queue: just append (also entered via the goto below) */
2821                 if (skb_queue_len(&sd->input_pkt_queue)) {
2822 enqueue:
2823                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2824                         input_queue_tail_incr_save(sd, qtail);
2825                         rps_unlock(sd);
2826                         local_irq_restore(flags);
2827                         return NET_RX_SUCCESS;
2828                 }
2829
2830                 /* Schedule NAPI for backlog device
2831                  * We can use non atomic operation since we own the queue lock
2832                  */
2833                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
                              /* remote sd: queued for an IPI; local sd: schedule here */
2834                         if (!rps_ipi_queued(sd))
2835                                 ____napi_schedule(sd, &sd->backlog);
2836                 }
2837                 goto enqueue;
2838         }
2839
              /* queue is over the limit: account for the drop and free the skb */
2840         sd->dropped++;
2841         rps_unlock(sd);
2842
2843         local_irq_restore(flags);
2844
2845         atomic_long_inc(&skb->dev->rx_dropped);
2846         kfree_skb(skb);
2847         return NET_RX_DROP;
2848 }
2849
2850 /**
2851  *      netif_rx        -       post buffer to the network code
2852  *      @skb: buffer to post
2853  *
2854  *      This function receives a packet from a device driver and queues it for
2855  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2856  *      may be dropped during processing for congestion control or by the
2857  *      protocol layers.
2858  *
      *      The packet is placed on a per-CPU backlog queue through
      *      enqueue_to_backlog(): with CONFIG_RPS on the CPU chosen by
      *      get_rps_cpu() (the current CPU if it returns a negative value),
      *      otherwise on the current CPU.
      *
2859  *      return values:
2860  *      NET_RX_SUCCESS  (no congestion)
2861  *      NET_RX_DROP     (packet was dropped)
2862  *
2863  */
2864
2865 int netif_rx(struct sk_buff *skb)
2866 {
2867         int ret;
2868
2869         /* if netpoll wants it, pretend we never saw it */
2870         if (netpoll_rx(skb))
2871                 return NET_RX_DROP;
2872
2873         if (netdev_tstamp_prequeue)
2874                 net_timestamp_check(skb);
2875
2876         trace_netif_rx(skb);
2877 #ifdef CONFIG_RPS
2878         {
                      /* voidflow absorbs the last_qtail update when no flow entry is used */
2879                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2880                 int cpu;
2881
2882                 preempt_disable();
2883                 rcu_read_lock();
2884
2885                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2886                 if (cpu < 0)
2887                         cpu = smp_processor_id();
2888
2889                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2890
2891                 rcu_read_unlock();
2892                 preempt_enable();
2893         }
2894 #else
2895         {
2896                 unsigned int qtail;
2897                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2898                 put_cpu();
2899         }
2900 #endif
2901         return ret;
2902 }
2903 EXPORT_SYMBOL(netif_rx);
2904
     /*
      * netif_rx_ni - netif_rx() followed by running pending softirqs
      * @skb: buffer to post
      *
      * Calls netif_rx() with preemption disabled and, if
      * local_softirq_pending() reports work afterwards, runs do_softirq()
      * before preemption is enabled again.  Returns what netif_rx() returned.
      */
2905 int netif_rx_ni(struct sk_buff *skb)
2906 {
2907         int err;
2908
2909         preempt_disable();
2910         err = netif_rx(skb);
2911         if (local_softirq_pending())
2912                 do_softirq();
2913         preempt_enable();
2914
2915         return err;
2916 }
2917 EXPORT_SYMBOL(netif_rx_ni);
2918
     /*
      * net_tx_action - transmit-side softirq work for the current CPU
      * @h: unused
      *
      * Does two things with this CPU's softnet_data:
      *   - frees every skb chained on completion_queue;
      *   - walks the qdiscs chained on output_queue and runs each one whose
      *     root lock can be taken without waiting.
      * Both lists are detached with local interrupts disabled and then
      * processed with interrupts enabled again.
      *
      * NOTE(review): presumably registered as the NET_TX_SOFTIRQ handler;
      * the registration is not in this part of the file.
      */
2919 static void net_tx_action(struct softirq_action *h)
2920 {
2921         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2922
2923         if (sd->completion_queue) {
2924                 struct sk_buff *clist;
2925
2926                 local_irq_disable();
2927                 clist = sd->completion_queue;
2928                 sd->completion_queue = NULL;
2929                 local_irq_enable();
2930
2931                 while (clist) {
2932                         struct sk_buff *skb = clist;
2933                         clist = clist->next;
2934
                              /* warn if the skb still has users at this point */
2935                         WARN_ON(atomic_read(&skb->users));
2936                         trace_kfree_skb(skb, net_tx_action);
2937                         __kfree_skb(skb);
2938                 }
2939         }
2940
2941         if (sd->output_queue) {
2942                 struct Qdisc *head;
2943
2944                 local_irq_disable();
2945                 head = sd->output_queue;
2946                 sd->output_queue = NULL;
2947                 sd->output_queue_tailp = &sd->output_queue;
2948                 local_irq_enable();
2949
2950                 while (head) {
2951                         struct Qdisc *q = head;
2952                         spinlock_t *root_lock;
2953
2954                         head = head->next_sched;
2955
2956                         root_lock = qdisc_lock(q);
2957                         if (spin_trylock(root_lock)) {
2958                                 smp_mb__before_clear_bit();
2959                                 clear_bit(__QDISC_STATE_SCHED,
2960                                           &q->state);
2961                                 qdisc_run(q);
2962                                 spin_unlock(root_lock);
2963                         } else {
                                      /*
                                       * Lock is busy: reschedule the qdisc unless it
                                       * has been deactivated, in which case only the
                                       * SCHED bit is cleared.
                                       */
2964                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2965                                               &q->state)) {
2966                                         __netif_reschedule(q);
2967                                 } else {
2968                                         smp_mb__before_clear_bit();
2969                                         clear_bit(__QDISC_STATE_SCHED,
2970                                                   &q->state);
2971                                 }
2972                         }
2973                 }
2974         }
2975 }
2976
2977 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2978     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2979 /*
      * This hook is defined here for ATM LANE.  It is only built when both
      * bridging and ATM LANE are configured (built in or as modules), and
      * it is exported to GPL modules.
      */
2980 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2981                              unsigned char *addr) __read_mostly;
2982 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2983 #endif
2984
2985 #ifdef CONFIG_NET_CLS_ACT
2986 /* TODO: Maybe we should just force sch_ingress to be compiled in
2987  * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
2988  * instructions (a compare and 2 extra stores right now) if we don't have
2989  * it on but do have CONFIG_NET_CLS_ACT.
2990  * NOTE: This doesn't stop any functionality; if you don't have
2991  * the ingress scheduler, you just can't add policies on ingress.
2992  *
      * ing_filter - run @skb through the qdisc attached to @rxq
      *
      * Increments the redirect TTL kept in skb->tc_verd and returns
      * TC_ACT_SHOT if it had already exceeded MAX_RED_LOOP.  Otherwise the
      * skb is marked AT_INGRESS and, unless the qdisc is noop_qdisc or has
      * been deactivated, enqueued on it under the qdisc lock.  Returns the
      * enqueue result, or TC_ACT_OK if nothing was enqueued.
2993  */
2994 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2995 {
2996         struct net_device *dev = skb->dev;
2997         u32 ttl = G_TC_RTTL(skb->tc_verd);
2998         int result = TC_ACT_OK;
2999         struct Qdisc *q;
3000
              /* compare against the old TTL, then bump it for the store below */
3001         if (unlikely(MAX_RED_LOOP < ttl++)) {
3002                 if (net_ratelimit())
3003                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3004                                skb->skb_iif, dev->ifindex);
3005                 return TC_ACT_SHOT;
3006         }
3007
3008         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3009         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3010
3011         q = rxq->qdisc;
3012         if (q != &noop_qdisc) {
3013                 spin_lock(qdisc_lock(q));
3014                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3015                         result = qdisc_enqueue_root(skb, q);
3016                 spin_unlock(qdisc_lock(q));
3017         }
3018
3019         return result;
3020 }
3021
     /*
      * handle_ing - pass @skb through the device's ingress qdisc, if any
      *
      * Nothing is filtered when the device has no ingress queue or that
      * queue only carries noop_qdisc.  Otherwise a handler pending in
      * *@pt_prev is served first (its result goes to *@ret), then
      * ing_filter() is run.  On TC_ACT_SHOT or TC_ACT_STOLEN the skb is
      * freed and NULL is returned; in every other case tc_verd is cleared
      * and the skb is handed back.
      */
3022 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3023                                          struct packet_type **pt_prev,
3024                                          int *ret, struct net_device *orig_dev)
3025 {
3026         struct netdev_queue *ingress = rcu_dereference(skb->dev->ingress_queue);
3027         int verdict;
3028
3029         if (ingress && ingress->qdisc != &noop_qdisc) {
3030                 /* serve the deferred handler before the qdisc sees the skb */
3031                 if (*pt_prev) {
3032                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
3033                         *pt_prev = NULL;
3034                 }
3035
3036                 verdict = ing_filter(skb, ingress);
3037                 if (verdict == TC_ACT_SHOT || verdict == TC_ACT_STOLEN) {
3038                         kfree_skb(skb);
3039                         return NULL;
3040                 }
3041         }
3042
3043         /* the skb continues up the stack with a clean tc_verd */
3044         skb->tc_verd = 0;
3045         return skb;
3046 }
3047 #endif
3048
3049 /**
3050  *      netdev_rx_handler_register - register receive handler
3051  *      @dev: device to register a handler for
3052  *      @rx_handler: receive handler to register
3053  *      @rx_handler_data: data pointer that is used by rx handler
3054  *
3055  *      Register a receive handler for a device. This handler will then be
3056  *      called from __netif_receive_skb. A negative errno code is returned
3057  *      on a failure: -EBUSY if the device already has a handler.
3058  *      Returns 0 on success.
3059  *
3060  *      The caller must hold the rtnl_mutex.
3061  */
3062 int netdev_rx_handler_register(struct net_device *dev,
3063                                rx_handler_func_t *rx_handler,
3064                                void *rx_handler_data)
3065 {
3066         ASSERT_RTNL();
3067
3068         if (dev->rx_handler)
3069                 return -EBUSY;
3070
              /* publish the data pointer before the handler that will use it */
3071         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3072         rcu_assign_pointer(dev->rx_handler, rx_handler);
3073
3074         return 0;
3075 }
3076 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3076
3077 /**
3078  *      netdev_rx_handler_unregister - unregister receive handler
3079  *      @dev: device to unregister a handler from
3080  *
3081  *      Unregister a receive handler from a device: both the handler and
      *      its data pointer are reset to NULL.
3082  *
3083  *      The caller must hold the rtnl_mutex.
3084  */
3085 void netdev_rx_handler_unregister(struct net_device *dev)
3086 {
3087
3088         ASSERT_RTNL();
              /* handler first, then its data: the reverse of registration */
3089         rcu_assign_pointer(dev->rx_handler, NULL);
3090         rcu_assign_pointer(dev->rx_handler_data, NULL);
3091 }
3092 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3093
3094 /* Overwrite the destination MAC of a PACKET_HOST frame with @master's. */
3095 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
3096                                               struct net_device *master)
3097 {
3098         if (skb->pkt_type != PACKET_HOST)
3099                 return;
3100
3101         memcpy(eth_hdr(skb)->h_dest, master->dev_addr, ETH_ALEN);
3102 }
3103
3104 /*
3105  * Frames seen on a bonding slave flagged IFF_SLAVE_INACTIVE are treated
3106  * as duplicates and dropped (return 1), with the three exceptions marked
3107  * at the checks below.  Everything else is kept (return 0).
3108  */
3109 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
3110 {
3111         struct net_device *slave = skb->dev;
3112
3113         if (master->priv_flags & IFF_MASTER_ARPMON)
3114                 slave->last_rx = jiffies;
3115
3116         /* ALB master in a bridge: unmangle the destination MAC to the master's */
3117         if ((master->priv_flags & IFF_MASTER_ALB) &&
3118             (master->priv_flags & IFF_BRIDGE_PORT))
3119                 skb_bond_set_mac_by_master(skb, master);
3120
3121         if (!(slave->priv_flags & IFF_SLAVE_INACTIVE))
3122                 return 0;
3123
3124         /* exception 1: ARP on a slave flagged IFF_SLAVE_NEEDARP */
3125         if ((slave->priv_flags & IFF_SLAVE_NEEDARP) &&
3126             skb->protocol == __cpu_to_be16(ETH_P_ARP))
3127                 return 0;
3128
3129         /* exception 2: ALB, anything that is neither broadcast nor multicast */
3130         if ((master->priv_flags & IFF_MASTER_ALB) &&
3131             skb->pkt_type != PACKET_BROADCAST &&
3132             skb->pkt_type != PACKET_MULTICAST)
3133                 return 0;
3134
3135         /* exception 3: 802.3ad, ETH_P_SLOW frames */
3136         if ((master->priv_flags & IFF_MASTER_8023AD) &&
3137             skb->protocol == __cpu_to_be16(ETH_P_SLOW))
3138                 return 0;
3139
3140         return 1;
3141 }
3142 EXPORT_SYMBOL(__skb_bond_should_drop);
3143
     /*
      * __netif_receive_skb - hand one received skb to its handlers
      * @skb: buffer to process
      *
      * After the bonding checks, the following happens under
      * rcu_read_lock():
      *   1. every matching handler on ptype_all,
      *   2. ingress filtering via handle_ing() (CONFIG_NET_CLS_ACT only;
      *      steps 1 and 2 are skipped for an skb marked TC_NCLS),
      *   3. the device's rx_handler, if one is registered,
      *   4. hardware-accelerated VLAN tag handling,
      *   5. the matching protocol handlers hashed in ptype_base[].
      * Delivery runs one step behind through pt_prev: each match first
      * delivers the previous one with deliver_skb(), and the final match
      * is called directly through ->func().
      *
      * If no handler is left at the end, the skb is freed, the device's
      * rx_dropped is incremented and NET_RX_DROP is returned.
      */
3144 static int __netif_receive_skb(struct sk_buff *skb)
3145 {
3146         struct packet_type *ptype, *pt_prev;
3147         rx_handler_func_t *rx_handler;
3148         struct net_device *orig_dev;
3149         struct net_device *master;
3150         struct net_device *null_or_orig;
3151         struct net_device *orig_or_bond;
3152         int ret = NET_RX_DROP;
3153         __be16 type;
3154
              /* timestamp here only if it was not already done before queueing */
3155         if (!netdev_tstamp_prequeue)
3156                 net_timestamp_check(skb);
3157
3158         trace_netif_receive_skb(skb);
3159
3160         /* if we've gotten here through NAPI, check netpoll */
3161         if (netpoll_receive_skb(skb))
3162                 return NET_RX_DROP;
3163
3164         if (!skb->skb_iif)
3165                 skb->skb_iif = skb->dev->ifindex;
3166
3167         /*
3168          * bonding note: skbs received on inactive slaves should only
3169          * be delivered to pkt handlers that are exact matches.  Also
3170          * the deliver_no_wcard flag will be set.  If packet handlers
3171          * are sensitive to duplicate packets these skbs will need to
3172          * be dropped at the handler.
3173          */
3174         null_or_orig = NULL;
3175         orig_dev = skb->dev;
3176         master = ACCESS_ONCE(orig_dev->master);
3177         if (skb->deliver_no_wcard)
3178                 null_or_orig = orig_dev;
3179         else if (master) {
3180                 if (skb_bond_should_drop(skb, master)) {
3181                         skb->deliver_no_wcard = 1;
3182                         null_or_orig = orig_dev; /* deliver only exact match */
3183                 } else
3184                         skb->dev = master;
3185         }
3186
3187         __this_cpu_inc(softnet_data.processed);
3188         skb_reset_network_header(skb);
3189         skb_reset_transport_header(skb);
3190         skb->mac_len = skb->network_header - skb->mac_header;
3191
3192         pt_prev = NULL;
3193
3194         rcu_read_lock();
3195
3196 #ifdef CONFIG_NET_CLS_ACT
              /* TC_NCLS: clear the mark and skip both the taps and ingress */
3197         if (skb->tc_verd & TC_NCLS) {
3198                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3199                 goto ncls;
3200         }
3201 #endif
3202
3203         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3204                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3205                     ptype->dev == orig_dev) {
3206                         if (pt_prev)
3207                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3208                         pt_prev = ptype;
3209                 }
3210         }
3211
3212 #ifdef CONFIG_NET_CLS_ACT
3213         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3214         if (!skb)
3215                 goto out;
3216 ncls:
3217 #endif
3218
3219         /* Handle special case of bridge or macvlan */
3220         rx_handler = rcu_dereference(skb->dev->rx_handler);
3221         if (rx_handler) {
3222                 if (pt_prev) {
3223                         ret = deliver_skb(skb, pt_prev, orig_dev);
3224                         pt_prev = NULL;
3225                 }
                      /* a NULL return means there is nothing left to process here */
3226                 skb = rx_handler(skb);
3227                 if (!skb)
3228                         goto out;
3229         }
3230
3231         if (vlan_tx_tag_present(skb)) {
3232                 if (pt_prev) {
3233                         ret = deliver_skb(skb, pt_prev, orig_dev);
3234                         pt_prev = NULL;
3235                 }
                      /* non-zero return: run the (possibly replaced) skb through again */
3236                 if (vlan_hwaccel_do_receive(&skb)) {
3237                         ret = __netif_receive_skb(skb);
3238                         goto out;
3239                 } else if (unlikely(!skb))
3240                         goto out;
3241         }
3242
3243         /*
3244          * Make sure frames received on VLAN interfaces stacked on
3245          * bonding interfaces still make their way to any base bonding
3246          * device that may have registered for a specific ptype.  The
3247          * handler may have to adjust skb->dev and orig_dev.
3248          */
3249         orig_or_bond = orig_dev;
3250         if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3251             (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
3252                 orig_or_bond = vlan_dev_real_dev(skb->dev);
3253         }
3254
3255         type = skb->protocol;
3256         list_for_each_entry_rcu(ptype,
3257                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3258                 if (ptype->type == type && (ptype->dev == null_or_orig ||
3259                      ptype->dev == skb->dev || ptype->dev == orig_dev ||
3260                      ptype->dev == orig_or_bond)) {
3261                         if (pt_prev)
3262                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3263                         pt_prev = ptype;
3264                 }
3265         }
3266
3267         if (pt_prev) {
3268                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3269         } else {
3270                 atomic_long_inc(&skb->dev->rx_dropped);
3271                 kfree_skb(skb);
3272                 /* Jamal, now you will not able to escape explaining
3273                  * me how you were going to use this. :-)
3274                  */
3275                 ret = NET_RX_DROP;
3276         }
3277
3278 out:
3279         rcu_read_unlock();
3280         return ret;
3281 }
3282
3283 /**
3284  *      netif_receive_skb - process receive buffer from network
3285  *      @skb: buffer to process
3286  *
3287  *      netif_receive_skb() is the main receive data processing function.
3288  *      It always succeeds. The buffer may be dropped during processing
3289  *      for congestion control or by the protocol layers.
3290  *
3291  *      This function may only be called from softirq context and interrupts
3292  *      should be enabled.
3293  *
      *      With CONFIG_RPS the packet is queued on another CPU's backlog when
      *      get_rps_cpu() selects one; otherwise it is processed right away
      *      by __netif_receive_skb().
      *
3294  *      Return values (usually ignored):
3295  *      NET_RX_SUCCESS: no congestion
3296  *      NET_RX_DROP: packet was dropped
3297  */
3298 int netif_receive_skb(struct sk_buff *skb)
3299 {
3300         if (netdev_tstamp_prequeue)
3301                 net_timestamp_check(skb);
3302
              /* a non-zero return means the skb is not to be processed here */
3303         if (skb_defer_rx_timestamp(skb))
3304                 return NET_RX_SUCCESS;
3305
3306 #ifdef CONFIG_RPS
3307         {
3308                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3309                 int cpu, ret;
3310
3311                 rcu_read_lock();
3312
3313                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3314
3315                 if (cpu >= 0) {
3316                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3317                         rcu_read_unlock();
3318                 } else {
                              /* no steering target: process on this CPU right away */
3319                         rcu_read_unlock();
3320                         ret = __netif_receive_skb(skb);
3321                 }
3322
3323                 return ret;
3324         }
3325 #else
3326         return __netif_receive_skb(skb);
3327 #endif
3328 }
3329 EXPORT_SYMBOL(netif_receive_skb);
3330
3331 /* Network device is going away, flush any packets still pending
3332  * Called with irqs disabled.
      *
      * @arg is the struct net_device being removed.  Every skb of that
      * device is unlinked from this CPU's input_pkt_queue (under rps_lock)
      * and process_queue and then freed; input_queue_head is advanced once
      * per removed skb.
3333  */
3334 static void flush_backlog(void *arg)
3335 {
3336         struct net_device *dev = arg;
3337         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3338         struct sk_buff *skb, *tmp;
3339
3340         rps_lock(sd);
3341         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3342                 if (skb->dev == dev) {
3343                         __skb_unlink(skb, &sd->input_pkt_queue);
3344                         kfree_skb(skb);
3345                         input_queue_head_incr(sd);
3346                 }
3347         }
3348         rps_unlock(sd);
3349
              /* process_queue is walked without taking rps_lock */
3350         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3351                 if (skb->dev == dev) {
3352                         __skb_unlink(skb, &sd->process_queue);
3353                         kfree_skb(skb);
3354                         input_queue_head_incr(sd);
3355                 }
3356         }
3357 }
3358
     /*
      * napi_gro_complete - finish GRO processing of @skb and pass it on
      *
      * An skb whose NAPI_GRO_CB count is 1 only has its gso_size cleared
      * and goes straight to netif_receive_skb().  Otherwise the
      * gro_complete callback of the first packet type that matches the
      * protocol, is not bound to a device and provides such a callback is
      * invoked first.  If that callback fails, or no such type exists (err
      * keeps its initial -ENOENT), the skb is freed and NET_RX_SUCCESS is
      * returned.
      *
      * Returns the result of netif_receive_skb() in the non-error cases.
      */
3359 static int napi_gro_complete(struct sk_buff *skb)
3360 {
3361         struct packet_type *ptype;
3362         __be16 type = skb->protocol;
3363         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3364         int err = -ENOENT;
3365
3366         if (NAPI_GRO_CB(skb)->count == 1) {
3367                 skb_shinfo(skb)->gso_size = 0;
3368                 goto out;
3369         }
3370
3371         rcu_read_lock();
3372         list_for_each_entry_rcu(ptype, head, list) {
3373                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3374                         continue;
3375
3376                 err = ptype->gro_complete(skb);
3377                 break;
3378         }
3379         rcu_read_unlock();
3380
3381         if (err) {
                      /* warn if the loop ran off the list without finding a type */
3382                 WARN_ON(&ptype->list == head);
3383                 kfree_skb(skb);
3384                 return NET_RX_SUCCESS;
3385         }
3386
3387 out:
3388         return netif_receive_skb(skb);
3389 }
3390
3391 inline void napi_gro_flush(struct napi_struct *napi)
3392 {
3393         struct sk_buff *skb, *next;
3394
3395         for (skb = napi->gro_list; skb; skb = next) {
3396                 next = skb->next;
3397                 skb->next = NULL;
3398                 napi_gro_complete(skb);
3399         }
3400
3401         napi->gro_count = 0;
3402         napi->gro_list = NULL;
3403 }
3404 EXPORT_SYMBOL(napi_gro_flush);
3405
3406 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3407 {
3408         struct sk_buff **pp = NULL;
3409         struct packet_type *ptype;
3410         __be16 type = skb->protocol;
3411         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3412         int same_flow;
3413         int mac_len;
3414         enum gro_result ret;
3415
3416         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3417                 goto normal;
3418
3419         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3420                 goto normal;
3421
3422         rcu_read_lock();
3423         list_for_each_entry_rcu(ptype, head, list) {
3424                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3425                         continue;
3426
3427                 skb_set_network_header(skb, skb_gro_offset(skb));
3428                 mac_len = skb->network_header - skb->mac_header;
3429                 skb->mac_len = mac_len;
3430                 NAPI_GRO_CB(skb)->same_flow = 0;
3431                 NAPI_GRO_CB(skb)->flush = 0;
3432                 NAPI_GRO_CB(skb)->free = 0;
3433
3434                 pp = ptype->gro_receive(&napi->gro_list, skb);
3435                 break;
3436         }
3437         rcu_read_unlock();
3438
3439         if (&ptype->list == head)
3440                 goto normal;
3441
3442         same_flow = NAPI_GRO_CB(skb)->same_flow;
3443         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3444
3445         if (pp) {
3446                 struct sk_buff *nskb = *pp;
3447
3448                 *pp = nskb->next;
3449                 nskb->next = NULL;
3450                 napi_gro_complete(nskb);
3451                 napi->gro_count--;
3452         }
3453
3454         if (same_flow)
3455                 goto ok;
3456
3457         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3458                 goto normal;
3459
3460         napi->gro_count++;
3461         NAPI_GRO_CB(skb)->count = 1;
3462         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3463         skb->next = napi->gro_list;
3464         napi->gro_list = skb;
3465         ret = GRO_HELD;
3466
3467 pull:
3468         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3469                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3470
3471                 BUG_ON(skb->end - skb->tail < grow);
3472
3473                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3474
3475                 skb->tail += grow;
3476                 skb->data_len -= grow;
3477
3478                 skb_shinfo(skb)->frags[0].page_offset += grow;
3479                 skb_shinfo(skb)->frags[0].size -= grow;
3480
3481                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3482                         put_page(skb_shinfo(skb)->frags[0].page);
3483                         memmove(skb_shinfo(skb)->frags,
3484                                 skb_shinfo(skb)->frags + 1,
3485                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3486                 }
3487         }
3488
3489 ok:
3490         return ret;
3491
3492 normal:
3493         ret = GRO_NORMAL;
3494         goto pull;
3495 }
3496 EXPORT_SYMBOL(dev_gro_receive);
3497
3498 static inline gro_result_t
3499 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3500 {
3501         struct sk_buff *p;
3502
3503         for (p = napi->gro_list; p; p = p->next) {
3504                 unsigned long diffs;
3505
3506                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3507                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3508                 diffs |= compare_ether_header(skb_mac_header(p),
3509                                               skb_gro_mac_header(skb));
3510                 NAPI_GRO_CB(p)->same_flow = !diffs;
3511                 NAPI_GRO_CB(p)->flush = 0;
3512         }
3513
3514         return dev_gro_receive(napi, skb);
3515 }
3516
3517 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3518 {
3519         switch (ret) {
3520         case GRO_NORMAL:
3521                 if (netif_receive_skb(skb))
3522                         ret = GRO_DROP;
3523                 break;
3524
3525         case GRO_DROP:
3526         case GRO_MERGED_FREE:
3527                 kfree_skb(skb);
3528                 break;
3529
3530         case GRO_HELD:
3531         case GRO_MERGED:
3532                 break;
3533         }
3534
3535         return ret;
3536 }
3537 EXPORT_SYMBOL(napi_skb_finish);
3538
3539 void skb_gro_reset_offset(struct sk_buff *skb)
3540 {
3541         NAPI_GRO_CB(skb)->data_offset = 0;
3542         NAPI_GRO_CB(skb)->frag0 = NULL;
3543         NAPI_GRO_CB(skb)->frag0_len = 0;
3544
3545         if (skb->mac_header == skb->tail &&
3546             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3547                 NAPI_GRO_CB(skb)->frag0 =
3548                         page_address(skb_shinfo(skb)->frags[0].page) +
3549                         skb_shinfo(skb)->frags[0].page_offset;
3550                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3551         }
3552 }
3553 EXPORT_SYMBOL(skb_gro_reset_offset);
3554
3555 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3556 {
3557         skb_gro_reset_offset(skb);
3558
3559         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3560 }
3561 EXPORT_SYMBOL(napi_gro_receive);
3562
3563 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3564 {
3565         __skb_pull(skb, skb_headlen(skb));
3566         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3567         skb->vlan_tci = 0;
3568         skb->dev = napi->dev;
3569         skb->skb_iif = 0;
3570
3571         napi->skb = skb;
3572 }
3573
3574 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3575 {
3576         struct sk_buff *skb = napi->skb;
3577
3578         if (!skb) {
3579                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3580                 if (skb)
3581                         napi->skb = skb;
3582         }
3583         return skb;
3584 }
3585 EXPORT_SYMBOL(napi_get_frags);
3586
3587 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3588                                gro_result_t ret)
3589 {
3590         switch (ret) {
3591         case GRO_NORMAL:
3592         case GRO_HELD:
3593                 skb->protocol = eth_type_trans(skb, skb->dev);
3594
3595                 if (ret == GRO_HELD)
3596                         skb_gro_pull(skb, -ETH_HLEN);
3597                 else if (netif_receive_skb(skb))
3598                         ret = GRO_DROP;
3599                 break;
3600
3601         case GRO_DROP:
3602         case GRO_MERGED_FREE:
3603                 napi_reuse_skb(napi, skb);
3604                 break;
3605
3606         case GRO_MERGED:
3607                 break;
3608         }
3609
3610         return ret;
3611 }
3612 EXPORT_SYMBOL(napi_frags_finish);
3613
3614 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3615 {
3616         struct sk_buff *skb = napi->skb;
3617         struct ethhdr *eth;
3618         unsigned int hlen;
3619         unsigned int off;
3620
3621         napi->skb = NULL;
3622
3623         skb_reset_mac_header(skb);
3624         skb_gro_reset_offset(skb);
3625
3626         off = skb_gro_offset(skb);
3627         hlen = off + sizeof(*eth);
3628         eth = skb_gro_header_fast(skb, off);
3629         if (skb_gro_header_hard(skb, hlen)) {
3630                 eth = skb_gro_header_slow(skb, hlen, off);
3631                 if (unlikely(!eth)) {
3632                         napi_reuse_skb(napi, skb);
3633                         skb = NULL;
3634                         goto out;
3635                 }
3636         }
3637
3638         skb_gro_pull(skb, sizeof(*eth));
3639
3640         /*
3641          * This works because the only protocols we care about don't require
3642          * special handling.  We'll fix it up properly at the end.
3643          */
3644         skb->protocol = eth->h_proto;
3645
3646 out:
3647         return skb;
3648 }
3649 EXPORT_SYMBOL(napi_frags_skb);
3650
3651 gro_result_t napi_gro_frags(struct napi_struct *napi)
3652 {
3653         struct sk_buff *skb = napi_frags_skb(napi);
3654
3655         if (!skb)
3656                 return GRO_DROP;
3657
3658         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3659 }
3660 EXPORT_SYMBOL(napi_gro_frags);
3661
/*
 * net_rps_action_and_irq_enable - send any pending RPS IPIs
 * @sd: softnet data of the local CPU
 *
 * Note: called with local irqs disabled, but exits with local irqs
 * enabled.  With CONFIG_RPS the pending list is detached while irqs are
 * still off; the IPIs themselves are sent after irqs are re-enabled,
 * and only to CPUs that are online.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *cur = sd->rps_ipi_list;

	if (!cur) {
		local_irq_enable();
		return;
	}

	sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Kick RPS processing on each remote cpu that was queued. */
	while (cur) {
		struct softnet_data *following = cur->rps_ipi_next;

		if (cpu_online(cur->cpu))
			__smp_call_function_single(cur->cpu, &cur->csd, 0);
		cur = following;
	}
#else
	local_irq_enable();
#endif
}
3689
3690 static int process_backlog(struct napi_struct *napi, int quota)
3691 {
3692         int work = 0;
3693         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3694
3695 #ifdef CONFIG_RPS
3696         /* Check if we have pending ipi, its better to send them now,
3697          * not waiting net_rx_action() end.
3698          */
3699         if (sd->rps_ipi_list) {
3700                 local_irq_disable();
3701                 net_rps_action_and_irq_enable(sd);
3702         }
3703 #endif
3704         napi->weight = weight_p;
3705         local_irq_disable();
3706         while (work < quota) {
3707                 struct sk_buff *skb;
3708                 unsigned int qlen;
3709
3710                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3711                         local_irq_enable();
3712                         __netif_receive_skb(skb);
3713                         local_irq_disable();
3714                         input_queue_head_incr(sd);
3715                         if (++work >= quota) {
3716                                 local_irq_enable();
3717                                 return work;
3718                         }
3719                 }
3720
3721                 rps_lock(sd);
3722                 qlen = skb_queue_len(&sd->input_pkt_queue);
3723                 if (qlen)
3724                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3725                                                    &sd->process_queue);
3726
3727                 if (qlen < quota - work) {
3728                         /*
3729                          * Inline a custom version of __napi_complete().
3730                          * only current cpu owns and manipulates this napi,
3731                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3732                          * we can use a plain write instead of clear_bit(),
3733                          * and we dont need an smp_mb() memory barrier.
3734                          */
3735                         list_del(&napi->poll_list);
3736                         napi->state = 0;
3737
3738                         quota = work + qlen;
3739                 }
3740                 rps_unlock(sd);
3741         }
3742         local_irq_enable();
3743
3744         return work;
3745 }
3746
3747 /**
3748  * __napi_schedule - schedule for receive
3749  * @n: entry to schedule
3750  *
3751  * The entry's receive function will be scheduled to run
3752  */
3753 void __napi_schedule(struct napi_struct *n)
3754 {
3755         unsigned long flags;
3756
3757         local_irq_save(flags);
3758         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3759         local_irq_restore(flags);
3760 }
3761 EXPORT_SYMBOL(__napi_schedule);
3762
3763 void __napi_complete(struct napi_struct *n)
3764 {
3765         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3766         BUG_ON(n->gro_list);
3767
3768         list_del(&n->poll_list);
3769         smp_mb__before_clear_bit();
3770         clear_bit(NAPI_STATE_SCHED, &n->state);
3771 }
3772 EXPORT_SYMBOL(__napi_complete);
3773
3774 void napi_complete(struct napi_struct *n)
3775 {
3776         unsigned long flags;
3777
3778         /*
3779          * don't let napi dequeue from the cpu poll list
3780          * just in case its running on a different cpu
3781          */
3782         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3783                 return;
3784
3785         napi_gro_flush(n);
3786         local_irq_save(flags);
3787         __napi_complete(n);
3788         local_irq_restore(flags);
3789 }
3790 EXPORT_SYMBOL(napi_complete);
3791
3792 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3793                     int (*poll)(struct napi_struct *, int), int weight)
3794 {
3795         INIT_LIST_HEAD(&napi->poll_list);
3796         napi->gro_count = 0;
3797         napi->gro_list = NULL;
3798         napi->skb = NULL;
3799         napi->poll = poll;
3800         napi->weight = weight;
3801         list_add(&napi->dev_list, &dev->napi_list);
3802         napi->dev = dev;
3803 #ifdef CONFIG_NETPOLL
3804         spin_lock_init(&napi->poll_lock);
3805         napi->poll_owner = -1;
3806 #endif
3807         set_bit(NAPI_STATE_SCHED, &napi->state);
3808 }
3809 EXPORT_SYMBOL(netif_napi_add);
3810
3811 void netif_napi_del(struct napi_struct *napi)
3812 {
3813         struct sk_buff *skb, *next;
3814
3815         list_del_init(&napi->dev_list);
3816         napi_free_frags(napi);
3817
3818         for (skb = napi->gro_list; skb; skb = next) {
3819                 next = skb->next;
3820                 skb->next = NULL;
3821                 kfree_skb(skb);
3822         }
3823
3824         napi->gro_list = NULL;
3825         napi->gro_count = 0;
3826 }
3827 EXPORT_SYMBOL(netif_napi_del);
3828
3829 static void net_rx_action(struct softirq_action *h)
3830 {
3831         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3832         unsigned long time_limit = jiffies + 2;
3833         int budget = netdev_budget;
3834         void *have;
3835
3836         local_irq_disable();
3837
3838         while (!list_empty(&sd->poll_list)) {
3839                 struct napi_struct *n;
3840                 int work, weight;
3841
3842                 /* If softirq window is exhuasted then punt.
3843                  * Allow this to run for 2 jiffies since which will allow
3844                  * an average latency of 1.5/HZ.
3845                  */
3846                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3847                         goto softnet_break;
3848
3849                 local_irq_enable();
3850
3851                 /* Even though interrupts have been re-enabled, this
3852                  * access is safe because interrupts can only add new
3853                  * entries to the tail of this list, and only ->poll()
3854                  * calls can remove this head entry from the list.
3855                  */
3856                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3857
3858                 have = netpoll_poll_lock(n);
3859
3860                 weight = n->weight;
3861
3862                 /* This NAPI_STATE_SCHED test is for avoiding a race
3863                  * with netpoll's poll_napi().  Only the entity which
3864                  * obtains the lock and sees NAPI_STATE_SCHED set will
3865                  * actually make the ->poll() call.  Therefore we avoid
3866                  * accidently calling ->poll() when NAPI is not scheduled.
3867                  */
3868                 work = 0;
3869                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3870                         work = n->poll(n, weight);
3871                         trace_napi_poll(n);
3872                 }
3873
3874                 WARN_ON_ONCE(work > weight);
3875
3876                 budget -= work;
3877
3878                 local_irq_disable();
3879
3880                 /* Drivers must not modify the NAPI state if they
3881                  * consume the entire weight.  In such cases this code
3882                  * still "owns" the NAPI instance and therefore can
3883                  * move the instance around on the list at-will.
3884                  */
3885                 if (unlikely(work == weight)) {
3886                         if (unlikely(napi_disable_pending(n))) {
3887                                 local_irq_enable();
3888                                 napi_complete(n);
3889                                 local_irq_disable();
3890                         } else
3891                                 list_move_tail(&n->poll_list, &sd->poll_list);
3892                 }
3893
3894                 netpoll_poll_unlock(have);
3895         }
3896 out:
3897         net_rps_action_and_irq_enable(sd);
3898
3899 #ifdef CONFIG_NET_DMA
3900         /*
3901          * There may not be any more sk_buffs coming right now, so push
3902          * any pending DMA copies to hardware
3903          */
3904         dma_issue_pending_all();
3905 #endif
3906
3907         return;
3908
3909 softnet_break:
3910         sd->time_squeeze++;
3911         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3912         goto out;
3913 }
3914
3915 static gifconf_func_t *gifconf_list[NPROTO];
3916
3917 /**
3918  *      register_gifconf        -       register a SIOCGIF handler
3919  *      @family: Address family
3920  *      @gifconf: Function handler
3921  *
3922  *      Register protocol dependent address dumping routines. The handler
3923  *      that is passed must not be freed or reused until it has been replaced
3924  *      by another handler.
3925  */
3926 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3927 {
3928         if (family >= NPROTO)
3929                 return -EINVAL;
3930         gifconf_list[family] = gifconf;
3931         return 0;
3932 }
3933 EXPORT_SYMBOL(register_gifconf);
3934
3935
3936 /*
3937  *      Map an interface index to its name (SIOCGIFNAME)
3938  */
3939
3940 /*
3941  *      We need this ioctl for efficient implementation of the
3942  *      if_indextoname() function required by the IPv6 API.  Without
3943  *      it, we would have to search all the interfaces to find a
3944  *      match.  --pb
3945  */
3946
3947 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3948 {
3949         struct net_device *dev;
3950         struct ifreq ifr;
3951
3952         /*
3953          *      Fetch the caller's info block.
3954          */
3955
3956         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3957                 return -EFAULT;
3958
3959         rcu_read_lock();
3960         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3961         if (!dev) {
3962                 rcu_read_unlock();
3963                 return -ENODEV;
3964         }
3965
3966         strcpy(ifr.ifr_name, dev->name);
3967         rcu_read_unlock();
3968
3969         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3970                 return -EFAULT;
3971         return 0;
3972 }
3973
3974 /*
3975  *      Perform a SIOCGIFCONF call. This structure will change
3976  *      size eventually, and there is nothing I can do about it.
3977  *      Thus we will need a 'compatibility mode'.
3978  */
3979
3980 static int dev_ifconf(struct net *net, char __user *arg)
3981 {
3982         struct ifconf ifc;
3983         struct net_device *dev;
3984         char __user *pos;
3985         int len;
3986         int total;
3987         int i;
3988
3989         /*
3990          *      Fetch the caller's info block.
3991          */
3992
3993         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3994                 return -EFAULT;
3995
3996         pos = ifc.ifc_buf;
3997         len = ifc.ifc_len;
3998
3999         /*
4000          *      Loop over the interfaces, and write an info block for each.
4001          */
4002
4003         total = 0;
4004         for_each_netdev(net, dev) {
4005                 for (i = 0; i < NPROTO; i++) {
4006                         if (gifconf_list[i]) {
4007                                 int done;
4008                                 if (!pos)
4009                                         done = gifconf_list[i](dev, NULL, 0);
4010                                 else
4011                                         done = gifconf_list[i](dev, pos + total,
4012                                                                len - total);
4013                                 if (done < 0)
4014                                         return -EFAULT;
4015                                 total += done;
4016                         }
4017                 }
4018         }
4019
4020         /*
4021          *      All done.  Write the updated control block back to the caller.
4022          */
4023         ifc.ifc_len = total;
4024
4025         /*
4026          *      Both BSD and Solaris return 0 here, so we do too.
4027          */
4028         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4029 }
4030
4031 #ifdef CONFIG_PROC_FS
4032 /*
4033  *      This is invoked by the /proc filesystem handler to display a device
4034  *      in detail.
4035  */
4036 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4037         __acquires(RCU)
4038 {
4039         struct net *net = seq_file_net(seq);
4040         loff_t off;
4041         struct net_device *dev;
4042
4043         rcu_read_lock();
4044         if (!*pos)
4045                 return SEQ_START_TOKEN;
4046
4047         off = 1;
4048         for_each_netdev_rcu(net, dev)
4049                 if (off++ == *pos)
4050                         return dev;
4051
4052         return NULL;
4053 }
4054
4055 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4056 {
4057         struct net_device *dev = v;
4058
4059         if (v == SEQ_START_TOKEN)
4060                 dev = first_net_device_rcu(seq_file_net(seq));
4061         else
4062                 dev = next_net_device_rcu(dev);
4063
4064         ++*pos;
4065         return dev;
4066 }
4067
4068 void dev_seq_stop(struct seq_file *seq, void *v)
4069         __releases(RCU)
4070 {
4071         rcu_read_unlock();
4072 }
4073
4074 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4075 {
4076         struct rtnl_link_stats64 temp;
4077         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4078
4079         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4080                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4081                    dev->name, stats->rx_bytes, stats->rx_packets,
4082                    stats->rx_errors,
4083                    stats->rx_dropped + stats->rx_missed_errors,
4084                    stats->rx_fifo_errors,
4085                    stats->rx_length_errors + stats->rx_over_errors +
4086                     stats->rx_crc_errors + stats->rx_frame_errors,
4087                    stats->rx_compressed, stats->multicast,
4088                    stats->tx_bytes, stats->tx_packets,
4089                    stats->tx_errors, stats->tx_dropped,
4090                    stats->tx_fifo_errors, stats->collisions,
4091                    stats->tx_carrier_errors +
4092                     stats->tx_aborted_errors +
4093                     stats->tx_window_errors +
4094                     stats->tx_heartbeat_errors,
4095                    stats->tx_compressed);
4096 }
4097
4098 /*
4099  *      Called from the PROCfs module. This now uses the new arbitrary sized
4100  *      /proc/net interface to create /proc/net/dev
4101  */
4102 static int dev_seq_show(struct seq_file *seq, void *v)
4103 {
4104         if (v == SEQ_START_TOKEN)
4105                 seq_puts(seq, "Inter-|   Receive                            "
4106                               "                    |  Transmit\n"
4107                               " face |bytes    packets errs drop fifo frame "
4108                               "compressed multicast|bytes    packets errs "
4109                               "drop fifo colls carrier compressed\n");
4110         else
4111                 dev_seq_printf_stats(seq, v);
4112         return 0;
4113 }
4114
4115 static struct softnet_data *softnet_get_online(loff_t *pos)
4116 {
4117         struct softnet_data *sd = NULL;
4118
4119         while (*pos < nr_cpu_ids)
4120                 if (cpu_online(*pos)) {
4121                         sd = &per_cpu(softnet_data, *pos);
4122                         break;
4123                 } else
4124                         ++*pos;
4125         return sd;
4126 }
4127
4128 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4129 {
4130         return softnet_get_online(pos);
4131 }
4132
4133 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4134 {
4135         ++*pos;
4136         return softnet_get_online(pos);
4137 }
4138
/* The softnet_stat iterator acquires nothing in ->start, so ->stop is empty. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4142
4143 static int softnet_seq_show(struct seq_file *seq, void *v)
4144 {
4145         struct softnet_data *sd = v;
4146
4147         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4148                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4149                    0, 0, 0, 0, /* was fastroute */
4150                    sd->cpu_collision, sd->received_rps);
4151         return 0;
4152 }
4153
4154 static const struct seq_operations dev_seq_ops = {
4155         .start = dev_seq_start,
4156         .next  = dev_seq_next,
4157         .stop  = dev_seq_stop,
4158         .show  = dev_seq_show,
4159 };
4160
4161 static int dev_seq_open(struct inode *inode, struct file *file)
4162 {
4163         return seq_open_net(inode, file, &dev_seq_ops,
4164                             sizeof(struct seq_net_private));
4165 }
4166
4167 static const struct file_operations dev_seq_fops = {
4168         .owner   = THIS_MODULE,
4169         .open    = dev_seq_open,
4170         .read    = seq_read,
4171         .llseek  = seq_lseek,
4172         .release = seq_release_net,
4173 };
4174
4175 static const struct seq_operations softnet_seq_ops = {
4176         .start = softnet_seq_start,
4177         .next  = softnet_seq_next,
4178         .stop  = softnet_seq_stop,
4179         .show  = softnet_seq_show,
4180 };
4181
4182 static int softnet_seq_open(struct inode *inode, struct file *file)
4183 {
4184         return seq_open(file, &softnet_seq_ops);
4185 }
4186
4187 static const struct file_operations softnet_seq_fops = {
4188         .owner   = THIS_MODULE,
4189         .open    = softnet_seq_open,
4190         .read    = seq_read,
4191         .llseek  = seq_lseek,
4192         .release = seq_release,
4193 };
4194
4195 static void *ptype_get_idx(loff_t pos)
4196 {
4197         struct packet_type *pt = NULL;
4198         loff_t i = 0;
4199         int t;
4200
4201         list_for_each_entry_rcu(pt, &ptype_all, list) {
4202                 if (i == pos)
4203                         return pt;
4204                 ++i;
4205         }
4206
4207         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4208                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4209                         if (i == pos)
4210                                 return pt;
4211                         ++i;
4212                 }
4213         }
4214         return NULL;
4215 }
4216
4217 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4218         __acquires(RCU)
4219 {
4220         rcu_read_lock();
4221         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4222 }
4223
4224 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4225 {
4226         struct packet_type *pt;
4227         struct list_head *nxt;
4228         int hash;
4229
4230         ++*pos;
4231         if (v == SEQ_START_TOKEN)
4232                 return ptype_get_idx(0);
4233
4234         pt = v;
4235         nxt = pt->list.next;
4236         if (pt->type == htons(ETH_P_ALL)) {
4237                 if (nxt != &ptype_all)
4238                         goto found;
4239                 hash = 0;
4240                 nxt = ptype_base[0].next;
4241         } else
4242                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4243
4244         while (nxt == &ptype_base[hash]) {
4245                 if (++hash >= PTYPE_HASH_SIZE)
4246                         return NULL;
4247                 nxt = ptype_base[hash].next;
4248         }
4249 found:
4250         return list_entry(nxt, struct packet_type, list);
4251 }
4252
4253 static void ptype_seq_stop(struct seq_file *seq, void *v)
4254         __releases(RCU)
4255 {
4256         rcu_read_unlock();
4257 }
4258
4259 static int ptype_seq_show(struct seq_file *seq, void *v)
4260 {
4261         struct packet_type *pt = v;
4262
4263         if (v == SEQ_START_TOKEN)
4264                 seq_puts(seq, "Type Device      Function\n");
4265         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4266                 if (pt->type == htons(ETH_P_ALL))
4267                         seq_puts(seq, "ALL ");
4268                 else
4269                         seq_printf(seq, "%04x", ntohs(pt->type));
4270
4271                 seq_printf(seq, " %-8s %pF\n",
4272                            pt->dev ? pt->dev->name : "", pt->func);
4273         }
4274
4275         return 0;
4276 }
4277
4278 static const struct seq_operations ptype_seq_ops = {
4279         .start = ptype_seq_start,
4280         .next  = ptype_seq_next,
4281         .stop  = ptype_seq_stop,
4282         .show  = ptype_seq_show,
4283 };
4284
4285 static int ptype_seq_open(struct inode *inode, struct file *file)
4286 {
4287         return seq_open_net(inode, file, &ptype_seq_ops,
4288                         sizeof(struct seq_net_private));
4289 }
4290
4291 static const struct file_operations ptype_seq_fops = {
4292         .owner   = THIS_MODULE,
4293         .open    = ptype_seq_open,
4294         .read    = seq_read,
4295         .llseek  = seq_lseek,
4296         .release = seq_release_net,
4297 };
4298
4299
4300 static int __net_init dev_proc_net_init(struct net *net)
4301 {
4302         int rc = -ENOMEM;
4303
4304         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4305                 goto out;
4306         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4307                 goto out_dev;
4308         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4309                 goto out_softnet;
4310
4311         if (wext_proc_init(net))
4312                 goto out_ptype;
4313         rc = 0;
4314 out:
4315         return rc;
4316 out_ptype:
4317         proc_net_remove(net, "ptype");
4318 out_softnet:
4319         proc_net_remove(net, "softnet_stat");
4320 out_dev:
4321         proc_net_remove(net, "dev");
4322         goto out;
4323 }
4324
4325 static void __net_exit dev_proc_net_exit(struct net *net)
4326 {
4327         wext_proc_exit(net);
4328
4329         proc_net_remove(net, "ptype");
4330         proc_net_remove(net, "softnet_stat");
4331         proc_net_remove(net, "dev");
4332 }
4333
4334 static struct pernet_operations __net_initdata dev_proc_ops = {
4335         .init = dev_proc_net_init,
4336         .exit = dev_proc_net_exit,
4337 };
4338
4339 static int __init dev_proc_init(void)
4340 {
4341         return register_pernet_subsys(&dev_proc_ops);
4342 }
4343 #else
4344 #define dev_proc_init() 0
4345 #endif  /* CONFIG_PROC_FS */
4346
4347
4348 /**
4349  *      netdev_set_master       -       set up master/slave pair
4350  *      @slave: slave device
4351  *      @master: new master device
4352  *
4353  *      Changes the master device of the slave. Pass %NULL to break the
4354  *      bonding. The caller must hold the RTNL semaphore. On a failure
4355  *      a negative errno code is returned. On success the reference counts
4356  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4357  *      function returns zero.
4358  */
4359 int netdev_set_master(struct net_device *slave, struct net_device *master)
4360 {
4361         struct net_device *old = slave->master;
4362
4363         ASSERT_RTNL();
4364
4365         if (master) {
4366                 if (old)
4367                         return -EBUSY;
4368                 dev_hold(master);
4369         }
4370
4371         slave->master = master;
4372
4373         if (old) {
4374                 synchronize_net();
4375                 dev_put(old);
4376         }
4377         if (master)
4378                 slave->flags |= IFF_SLAVE;
4379         else
4380                 slave->flags &= ~IFF_SLAVE;
4381
4382         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4383         return 0;
4384 }
4385 EXPORT_SYMBOL(netdev_set_master);
4386
4387 static void dev_change_rx_flags(struct net_device *dev, int flags)
4388 {
4389         const struct net_device_ops *ops = dev->netdev_ops;
4390
4391         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4392                 ops->ndo_change_rx_flags(dev, flags);
4393 }
4394
4395 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4396 {
4397         unsigned short old_flags = dev->flags;
4398         uid_t uid;
4399         gid_t gid;
4400
4401         ASSERT_RTNL();
4402
4403         dev->flags |= IFF_PROMISC;
4404         dev->promiscuity += inc;
4405         if (dev->promiscuity == 0) {
4406                 /*
4407                  * Avoid overflow.
4408                  * If inc causes overflow, untouch promisc and return error.
4409                  */
4410                 if (inc < 0)
4411                         dev->flags &= ~IFF_PROMISC;
4412                 else {
4413                         dev->promiscuity -= inc;
4414                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4415                                 "set promiscuity failed, promiscuity feature "
4416                                 "of device might be broken.\n", dev->name);
4417                         return -EOVERFLOW;
4418                 }
4419         }
4420         if (dev->flags != old_flags) {
4421                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4422                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4423                                                                "left");
4424                 if (audit_enabled) {
4425                         current_uid_gid(&uid, &gid);
4426                         audit_log(current->audit_context, GFP_ATOMIC,
4427                                 AUDIT_ANOM_PROMISCUOUS,
4428                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4429                                 dev->name, (dev->flags & IFF_PROMISC),
4430                                 (old_flags & IFF_PROMISC),
4431                                 audit_get_loginuid(current),
4432                                 uid, gid,
4433                                 audit_get_sessionid(current));
4434                 }
4435
4436                 dev_change_rx_flags(dev, IFF_PROMISC);
4437         }
4438         return 0;
4439 }
4440
4441 /**
4442  *      dev_set_promiscuity     - update promiscuity count on a device
4443  *      @dev: device
4444  *      @inc: modifier
4445  *
4446  *      Add or remove promiscuity from a device. While the count in the device
4447  *      remains above zero the interface remains promiscuous. Once it hits zero
4448  *      the device reverts back to normal filtering operation. A negative inc
4449  *      value is used to drop promiscuity on the device.
4450  *      Return 0 if successful or a negative errno code on error.
4451  */
4452 int dev_set_promiscuity(struct net_device *dev, int inc)
4453 {
4454         unsigned short old_flags = dev->flags;
4455         int err;
4456
4457         err = __dev_set_promiscuity(dev, inc);
4458         if (err < 0)
4459                 return err;
4460         if (dev->flags != old_flags)
4461                 dev_set_rx_mode(dev);
4462         return err;
4463 }
4464 EXPORT_SYMBOL(dev_set_promiscuity);
4465
4466 /**
4467  *      dev_set_allmulti        - update allmulti count on a device
4468  *      @dev: device
4469  *      @inc: modifier
4470  *
4471  *      Add or remove reception of all multicast frames to a device. While the
4472  *      count in the device remains above zero the interface remains listening
4473  *      to all interfaces. Once it hits zero the device reverts back to normal
4474  *      filtering operation. A negative @inc value is used to drop the counter
4475  *      when releasing a resource needing all multicasts.
4476  *      Return 0 if successful or a negative errno code on error.
4477  */
4478
4479 int dev_set_allmulti(struct net_device *dev, int inc)
4480 {
4481         unsigned short old_flags = dev->flags;
4482
4483         ASSERT_RTNL();
4484
4485         dev->flags |= IFF_ALLMULTI;
4486         dev->allmulti += inc;
4487         if (dev->allmulti == 0) {
4488                 /*
4489                  * Avoid overflow.
4490                  * If inc causes overflow, untouch allmulti and return error.
4491                  */
4492                 if (inc < 0)
4493                         dev->flags &= ~IFF_ALLMULTI;
4494                 else {
4495                         dev->allmulti -= inc;
4496                         printk(KERN_WARNING "%s: allmulti touches roof, "
4497                                 "set allmulti failed, allmulti feature of "
4498                                 "device might be broken.\n", dev->name);
4499                         return -EOVERFLOW;
4500                 }
4501         }
4502         if (dev->flags ^ old_flags) {
4503                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4504                 dev_set_rx_mode(dev);
4505         }
4506         return 0;
4507 }
4508 EXPORT_SYMBOL(dev_set_allmulti);
4509
4510 /*
4511  *      Upload unicast and multicast address lists to device and
4512  *      configure RX filtering. When the device doesn't support unicast
4513  *      filtering it is put in promiscuous mode while unicast addresses
4514  *      are present.
4515  */
4516 void __dev_set_rx_mode(struct net_device *dev)
4517 {
4518         const struct net_device_ops *ops = dev->netdev_ops;
4519
4520         /* dev_open will call this function so the list will stay sane. */
4521         if (!(dev->flags&IFF_UP))
4522                 return;
4523
4524         if (!netif_device_present(dev))
4525                 return;
4526
4527         if (ops->ndo_set_rx_mode)
4528                 ops->ndo_set_rx_mode(dev);
4529         else {
4530                 /* Unicast addresses changes may only happen under the rtnl,
4531                  * therefore calling __dev_set_promiscuity here is safe.
4532                  */
4533                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4534                         __dev_set_promiscuity(dev, 1);
4535                         dev->uc_promisc = 1;
4536                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4537                         __dev_set_promiscuity(dev, -1);
4538                         dev->uc_promisc = 0;
4539                 }
4540
4541                 if (ops->ndo_set_multicast_list)
4542                         ops->ndo_set_multicast_list(dev);
4543         }
4544 }
4545
/*
 *	Run __dev_set_rx_mode() bracketed by netif_addr_lock_bh() and
 *	netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4552
4553 /**
4554  *      dev_get_flags - get flags reported to userspace
4555  *      @dev: device
4556  *
4557  *      Get the combination of flag bits exported through APIs to userspace.
4558  */
4559 unsigned dev_get_flags(const struct net_device *dev)
4560 {
4561         unsigned flags;
4562
4563         flags = (dev->flags & ~(IFF_PROMISC |
4564                                 IFF_ALLMULTI |
4565                                 IFF_RUNNING |
4566                                 IFF_LOWER_UP |
4567                                 IFF_DORMANT)) |
4568                 (dev->gflags & (IFF_PROMISC |
4569                                 IFF_ALLMULTI));
4570
4571         if (netif_running(dev)) {
4572                 if (netif_oper_up(dev))
4573                         flags |= IFF_RUNNING;
4574                 if (netif_carrier_ok(dev))
4575                         flags |= IFF_LOWER_UP;
4576                 if (netif_dormant(dev))
4577                         flags |= IFF_DORMANT;
4578         }
4579
4580         return flags;
4581 }
4582 EXPORT_SYMBOL(dev_get_flags);
4583
4584 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4585 {
4586         int old_flags = dev->flags;
4587         int ret;
4588
4589         ASSERT_RTNL();
4590
4591         /*
4592          *      Set the flags on our device.
4593          */
4594
4595         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4596                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4597                                IFF_AUTOMEDIA)) |
4598                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4599                                     IFF_ALLMULTI));
4600
4601         /*
4602          *      Load in the correct multicast list now the flags have changed.
4603          */
4604
4605         if ((old_flags ^ flags) & IFF_MULTICAST)
4606                 dev_change_rx_flags(dev, IFF_MULTICAST);
4607
4608         dev_set_rx_mode(dev);
4609
4610         /*
4611          *      Have we downed the interface. We handle IFF_UP ourselves
4612          *      according to user attempts to set it, rather than blindly
4613          *      setting it.
4614          */
4615
4616         ret = 0;
4617         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4618                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4619
4620                 if (!ret)
4621                         dev_set_rx_mode(dev);
4622         }
4623
4624         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4625                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4626
4627                 dev->gflags ^= IFF_PROMISC;
4628                 dev_set_promiscuity(dev, inc);
4629         }
4630
4631         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4632            is important. Some (broken) drivers set IFF_PROMISC, when
4633            IFF_ALLMULTI is requested not asking us and not reporting.
4634          */
4635         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4636                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4637
4638                 dev->gflags ^= IFF_ALLMULTI;
4639                 dev_set_allmulti(dev, inc);
4640         }
4641
4642         return ret;
4643 }
4644
4645 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4646 {
4647         unsigned int changes = dev->flags ^ old_flags;
4648
4649         if (changes & IFF_UP) {
4650                 if (dev->flags & IFF_UP)
4651                         call_netdevice_notifiers(NETDEV_UP, dev);
4652                 else
4653                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4654         }
4655
4656         if (dev->flags & IFF_UP &&
4657             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4658                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4659 }
4660
4661 /**
4662  *      dev_change_flags - change device settings
4663  *      @dev: device
4664  *      @flags: device state flags
4665  *
4666  *      Change settings on device based state flags. The flags are
4667  *      in the userspace exported format.
4668  */
4669 int dev_change_flags(struct net_device *dev, unsigned flags)
4670 {
4671         int ret, changes;
4672         int old_flags = dev->flags;
4673
4674         ret = __dev_change_flags(dev, flags);
4675         if (ret < 0)
4676                 return ret;
4677
4678         changes = old_flags ^ dev->flags;
4679         if (changes)
4680                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4681
4682         __dev_notify_flags(dev, old_flags);
4683         return ret;
4684 }
4685 EXPORT_SYMBOL(dev_change_flags);
4686
4687 /**
4688  *      dev_set_mtu - Change maximum transfer unit
4689  *      @dev: device
4690  *      @new_mtu: new transfer unit
4691  *
4692  *      Change the maximum transfer size of the network device.
4693  */
4694 int dev_set_mtu(struct net_device *dev, int new_mtu)
4695 {
4696         const struct net_device_ops *ops = dev->netdev_ops;
4697         int err;
4698
4699         if (new_mtu == dev->mtu)
4700                 return 0;
4701
4702         /*      MTU must be positive.    */
4703         if (new_mtu < 0)
4704                 return -EINVAL;
4705
4706         if (!netif_device_present(dev))
4707                 return -ENODEV;
4708
4709         err = 0;
4710         if (ops->ndo_change_mtu)
4711                 err = ops->ndo_change_mtu(dev, new_mtu);
4712         else
4713                 dev->mtu = new_mtu;
4714
4715         if (!err && dev->flags & IFF_UP)
4716                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4717         return err;
4718 }
4719 EXPORT_SYMBOL(dev_set_mtu);
4720
4721 /**
4722  *      dev_set_group - Change group this device belongs to
4723  *      @dev: device
4724  *      @new_group: group this device should belong to
4725  */
4726 void dev_set_group(struct net_device *dev, int new_group)
4727 {
4728         dev->group = new_group;
4729 }
4730 EXPORT_SYMBOL(dev_set_group);
4731
4732 /**
4733  *      dev_set_mac_address - Change Media Access Control Address
4734  *      @dev: device
4735  *      @sa: new address
4736  *
4737  *      Change the hardware (MAC) address of the device
4738  */
4739 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4740 {
4741         const struct net_device_ops *ops = dev->netdev_ops;
4742         int err;
4743
4744         if (!ops->ndo_set_mac_address)
4745                 return -EOPNOTSUPP;
4746         if (sa->sa_family != dev->type)
4747                 return -EINVAL;
4748         if (!netif_device_present(dev))
4749                 return -ENODEV;
4750         err = ops->ndo_set_mac_address(dev, sa);
4751         if (!err)
4752                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4753         return err;
4754 }
4755 EXPORT_SYMBOL(dev_set_mac_address);
4756
4757 /*
4758  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4759  */
4760 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4761 {
4762         int err;
4763         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4764
4765         if (!dev)
4766                 return -ENODEV;
4767
4768         switch (cmd) {
4769         case SIOCGIFFLAGS:      /* Get interface flags */
4770                 ifr->ifr_flags = (short) dev_get_flags(dev);
4771                 return 0;
4772
4773         case SIOCGIFMETRIC:     /* Get the metric on the interface
4774                                    (currently unused) */
4775                 ifr->ifr_metric = 0;
4776                 return 0;
4777
4778         case SIOCGIFMTU:        /* Get the MTU of a device */
4779                 ifr->ifr_mtu = dev->mtu;
4780                 return 0;
4781
4782         case SIOCGIFHWADDR:
4783                 if (!dev->addr_len)
4784                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4785                 else
4786                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4787                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4788                 ifr->ifr_hwaddr.sa_family = dev->type;
4789                 return 0;
4790
4791         case SIOCGIFSLAVE:
4792                 err = -EINVAL;
4793                 break;
4794
4795         case SIOCGIFMAP:
4796                 ifr->ifr_map.mem_start = dev->mem_start;
4797                 ifr->ifr_map.mem_end   = dev->mem_end;
4798                 ifr->ifr_map.base_addr = dev->base_addr;
4799                 ifr->ifr_map.irq       = dev->irq;
4800                 ifr->ifr_map.dma       = dev->dma;
4801                 ifr->ifr_map.port      = dev->if_port;
4802                 return 0;
4803
4804         case SIOCGIFINDEX:
4805                 ifr->ifr_ifindex = dev->ifindex;
4806                 return 0;
4807
4808         case SIOCGIFTXQLEN:
4809                 ifr->ifr_qlen = dev->tx_queue_len;
4810                 return 0;
4811
4812         default:
4813                 /* dev_ioctl() should ensure this case
4814                  * is never reached
4815                  */
4816                 WARN_ON(1);
4817                 err = -EINVAL;
4818                 break;
4819
4820         }
4821         return err;
4822 }
4823
4824 /*
4825  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4826  */
4827 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4828 {
4829         int err;
4830         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4831         const struct net_device_ops *ops;
4832
4833         if (!dev)
4834                 return -ENODEV;
4835
4836         ops = dev->netdev_ops;
4837
4838         switch (cmd) {
4839         case SIOCSIFFLAGS:      /* Set interface flags */
4840                 return dev_change_flags(dev, ifr->ifr_flags);
4841
4842         case SIOCSIFMETRIC:     /* Set the metric on the interface
4843                                    (currently unused) */
4844                 return -EOPNOTSUPP;
4845
4846         case SIOCSIFMTU:        /* Set the MTU of a device */
4847                 return dev_set_mtu(dev, ifr->ifr_mtu);
4848
4849         case SIOCSIFHWADDR:
4850                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4851
4852         case SIOCSIFHWBROADCAST:
4853                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4854                         return -EINVAL;
4855                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4856                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4857                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4858                 return 0;
4859
4860         case SIOCSIFMAP:
4861                 if (ops->ndo_set_config) {
4862                         if (!netif_device_present(dev))
4863                                 return -ENODEV;
4864                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4865                 }
4866                 return -EOPNOTSUPP;
4867
4868         case SIOCADDMULTI:
4869                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4870                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4871                         return -EINVAL;
4872                 if (!netif_device_present(dev))
4873                         return -ENODEV;
4874                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4875
4876         case SIOCDELMULTI:
4877                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4878                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4879                         return -EINVAL;
4880                 if (!netif_device_present(dev))
4881                         return -ENODEV;
4882                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4883
4884         case SIOCSIFTXQLEN:
4885                 if (ifr->ifr_qlen < 0)
4886                         return -EINVAL;
4887                 dev->tx_queue_len = ifr->ifr_qlen;
4888                 return 0;
4889
4890         case SIOCSIFNAME:
4891                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4892                 return dev_change_name(dev, ifr->ifr_newname);
4893
4894         /*
4895          *      Unknown or private ioctl
4896          */
4897         default:
4898                 if ((cmd >= SIOCDEVPRIVATE &&
4899                     cmd <= SIOCDEVPRIVATE + 15) ||
4900                     cmd == SIOCBONDENSLAVE ||
4901                     cmd == SIOCBONDRELEASE ||
4902                     cmd == SIOCBONDSETHWADDR ||
4903                     cmd == SIOCBONDSLAVEINFOQUERY ||
4904                     cmd == SIOCBONDINFOQUERY ||
4905                     cmd == SIOCBONDCHANGEACTIVE ||
4906                     cmd == SIOCGMIIPHY ||
4907                     cmd == SIOCGMIIREG ||
4908                     cmd == SIOCSMIIREG ||
4909                     cmd == SIOCBRADDIF ||
4910                     cmd == SIOCBRDELIF ||
4911                     cmd == SIOCSHWTSTAMP ||
4912                     cmd == SIOCWANDEV) {
4913                         err = -EOPNOTSUPP;
4914                         if (ops->ndo_do_ioctl) {
4915                                 if (netif_device_present(dev))
4916                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4917                                 else
4918                                         err = -ENODEV;
4919                         }
4920                 } else
4921                         err = -EINVAL;
4922
4923         }
4924         return err;
4925 }
4926
4927 /*
4928  *      This function handles all "interface"-type I/O control requests. The actual
4929  *      'doing' part of this is dev_ifsioc above.
4930  */
4931
4932 /**
4933  *      dev_ioctl       -       network device ioctl
4934  *      @net: the applicable net namespace
4935  *      @cmd: command to issue
4936  *      @arg: pointer to a struct ifreq in user space
4937  *
4938  *      Issue ioctl functions to devices. This is normally called by the
4939  *      user space syscall interfaces but can sometimes be useful for
4940  *      other purposes. The return value is the return from the syscall if
4941  *      positive or a negative errno code on error.
4942  */
4943
4944 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4945 {
4946         struct ifreq ifr;
4947         int ret;
4948         char *colon;
4949
4950         /* One special case: SIOCGIFCONF takes ifconf argument
4951            and requires shared lock, because it sleeps writing
4952            to user space.
4953          */
4954
4955         if (cmd == SIOCGIFCONF) {
4956                 rtnl_lock();
4957                 ret = dev_ifconf(net, (char __user *) arg);
4958                 rtnl_unlock();
4959                 return ret;
4960         }
4961         if (cmd == SIOCGIFNAME)
4962                 return dev_ifname(net, (struct ifreq __user *)arg);
4963
4964         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4965                 return -EFAULT;
4966
4967         ifr.ifr_name[IFNAMSIZ-1] = 0;
4968
4969         colon = strchr(ifr.ifr_name, ':');
4970         if (colon)
4971                 *colon = 0;
4972
4973         /*
4974          *      See which interface the caller is talking about.
4975          */
4976
4977         switch (cmd) {
4978         /*
4979          *      These ioctl calls:
4980          *      - can be done by all.
4981          *      - atomic and do not require locking.
4982          *      - return a value
4983          */
4984         case SIOCGIFFLAGS:
4985         case SIOCGIFMETRIC:
4986         case SIOCGIFMTU:
4987         case SIOCGIFHWADDR:
4988         case SIOCGIFSLAVE:
4989         case SIOCGIFMAP:
4990         case SIOCGIFINDEX:
4991         case SIOCGIFTXQLEN:
4992                 dev_load(net, ifr.ifr_name);
4993                 rcu_read_lock();
4994                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4995                 rcu_read_unlock();
4996                 if (!ret) {
4997                         if (colon)
4998                                 *colon = ':';
4999                         if (copy_to_user(arg, &ifr,
5000                                          sizeof(struct ifreq)))
5001                                 ret = -EFAULT;
5002                 }
5003                 return ret;
5004
5005         case SIOCETHTOOL:
5006                 dev_load(net, ifr.ifr_name);
5007                 rtnl_lock();
5008                 ret = dev_ethtool(net, &ifr);
5009                 rtnl_unlock();
5010                 if (!ret) {
5011                         if (colon)
5012                                 *colon = ':';
5013                         if (copy_to_user(arg, &ifr,
5014                                          sizeof(struct ifreq)))
5015                                 ret = -EFAULT;
5016                 }
5017                 return ret;
5018
5019         /*
5020          *      These ioctl calls:
5021          *      - require superuser power.
5022          *      - require strict serialization.
5023          *      - return a value
5024          */
5025         case SIOCGMIIPHY:
5026         case SIOCGMIIREG:
5027         case SIOCSIFNAME:
5028                 if (!capable(CAP_NET_ADMIN))
5029                         return -EPERM;
5030                 dev_load(net, ifr.ifr_name);
5031                 rtnl_lock();
5032                 ret = dev_ifsioc(net, &ifr, cmd);
5033                 rtnl_unlock();
5034                 if (!ret) {
5035                         if (colon)
5036                                 *colon = ':';
5037                         if (copy_to_user(arg, &ifr,
5038                                          sizeof(struct ifreq)))
5039                                 ret = -EFAULT;
5040                 }
5041                 return ret;
5042
5043         /*
5044          *      These ioctl calls:
5045          *      - require superuser power.
5046          *      - require strict serialization.
5047          *      - do not return a value
5048          */
5049         case SIOCSIFFLAGS:
5050         case SIOCSIFMETRIC:
5051         case SIOCSIFMTU:
5052         case SIOCSIFMAP:
5053         case SIOCSIFHWADDR:
5054         case SIOCSIFSLAVE:
5055         case SIOCADDMULTI:
5056         case SIOCDELMULTI:
5057         case SIOCSIFHWBROADCAST:
5058         case SIOCSIFTXQLEN:
5059         case SIOCSMIIREG:
5060         case SIOCBONDENSLAVE:
5061         case SIOCBONDRELEASE:
5062         case SIOCBONDSETHWADDR:
5063         case SIOCBONDCHANGEACTIVE:
5064         case SIOCBRADDIF:
5065         case SIOCBRDELIF:
5066         case SIOCSHWTSTAMP:
5067                 if (!capable(CAP_NET_ADMIN))
5068                         return -EPERM;
5069                 /* fall through */
5070         case SIOCBONDSLAVEINFOQUERY:
5071         case SIOCBONDINFOQUERY:
5072                 dev_load(net, ifr.ifr_name);
5073                 rtnl_lock();
5074                 ret = dev_ifsioc(net, &ifr, cmd);
5075                 rtnl_unlock();
5076                 return ret;
5077
5078         case SIOCGIFMEM:
5079                 /* Get the per device memory space. We can add this but
5080                  * currently do not support it */
5081         case SIOCSIFMEM:
5082                 /* Set the per device memory buffer space.
5083                  * Not applicable in our case */
5084         case SIOCSIFLINK:
5085                 return -EINVAL;
5086
5087         /*
5088          *      Unknown or private ioctl.
5089          */
5090         default:
5091                 if (cmd == SIOCWANDEV ||
5092                     (cmd >= SIOCDEVPRIVATE &&
5093                      cmd <= SIOCDEVPRIVATE + 15)) {
5094                         dev_load(net, ifr.ifr_name);
5095                         rtnl_lock();
5096                         ret = dev_ifsioc(net, &ifr, cmd);
5097                         rtnl_unlock();
5098                         if (!ret && copy_to_user(arg, &ifr,
5099                                                  sizeof(struct ifreq)))
5100                                 ret = -EFAULT;
5101                         return ret;
5102                 }
5103                 /* Take care of Wireless Extensions */
5104                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5105                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5106                 return -EINVAL;
5107         }
5108 }
5109
5110
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  A function-static counter is advanced (restarting at 1
 *	once it is no longer positive) until a value is found that no
 *	device in @net currently uses.  The caller must hold the rtnl
 *	semaphore or the dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5129
5130 /* Delayed registration/unregisteration */
5131 static LIST_HEAD(net_todo_list);
5132
5133 static void net_set_todo(struct net_device *dev)
5134 {
5135         list_add_tail(&dev->todo_list, &net_todo_list);
5136 }
5137
5138 static void rollback_registered_many(struct list_head *head)
5139 {
5140         struct net_device *dev, *tmp;
5141
5142         BUG_ON(dev_boot_phase);
5143         ASSERT_RTNL();
5144
5145         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5146                 /* Some devices call without registering
5147                  * for initialization unwind. Remove those
5148                  * devices and proceed with the remaining.
5149                  */
5150                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5151                         pr_debug("unregister_netdevice: device %s/%p never "
5152                                  "was registered\n", dev->name, dev);
5153
5154                         WARN_ON(1);
5155                         list_del(&dev->unreg_list);
5156                         continue;
5157                 }
5158
5159                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5160         }
5161
5162         /* If device is running, close it first. */
5163         dev_close_many(head);
5164
5165         list_for_each_entry(dev, head, unreg_list) {
5166                 /* And unlink it from device chain. */
5167                 unlist_netdevice(dev);
5168
5169                 dev->reg_state = NETREG_UNREGISTERING;
5170         }
5171
5172         synchronize_net();
5173
5174         list_for_each_entry(dev, head, unreg_list) {
5175                 /* Shutdown queueing discipline. */
5176                 dev_shutdown(dev);
5177
5178
5179                 /* Notify protocols, that we are about to destroy
5180                    this device. They should clean all the things.
5181                 */
5182                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5183
5184                 if (!dev->rtnl_link_ops ||
5185                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5186                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5187
5188                 /*
5189                  *      Flush the unicast and multicast chains
5190                  */
5191                 dev_uc_flush(dev);
5192                 dev_mc_flush(dev);
5193
5194                 if (dev->netdev_ops->ndo_uninit)
5195                         dev->netdev_ops->ndo_uninit(dev);
5196
5197                 /* Notifier chain MUST detach us from master device. */
5198                 WARN_ON(dev->master);
5199
5200                 /* Remove entries from kobject tree */
5201                 netdev_unregister_kobject(dev);
5202         }
5203
5204         /* Process any work delayed until the end of the batch */
5205         dev = list_first_entry(head, struct net_device, unreg_list);
5206         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5207
5208         rcu_barrier();
5209
5210         list_for_each_entry(dev, head, unreg_list)
5211                 dev_put(dev);
5212 }
5213
5214 static void rollback_registered(struct net_device *dev)
5215 {
5216         LIST_HEAD(single);
5217
5218         list_add(&dev->unreg_list, &single);
5219         rollback_registered_many(&single);
5220 }
5221
5222 u32 netdev_fix_features(struct net_device *dev, u32 features)
5223 {
5224         /* Fix illegal checksum combinations */
5225         if ((features & NETIF_F_HW_CSUM) &&
5226             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5227                 netdev_info(dev, "mixed HW and IP checksum settings.\n");
5228                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5229         }
5230
5231         if ((features & NETIF_F_NO_CSUM) &&
5232             (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5233                 netdev_info(dev, "mixed no checksumming and other settings.\n");
5234                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5235         }
5236
5237         /* Fix illegal SG+CSUM combinations. */
5238         if ((features & NETIF_F_SG) &&
5239             !(features & NETIF_F_ALL_CSUM)) {
5240                 netdev_info(dev,
5241                             "Dropping NETIF_F_SG since no checksum feature.\n");
5242                 features &= ~NETIF_F_SG;
5243         }
5244
5245         /* TSO requires that SG is present as well. */
5246         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5247                 netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n");
5248                 features &= ~NETIF_F_TSO;
5249         }
5250
5251         /* UFO needs SG and checksumming */
5252         if (features & NETIF_F_UFO) {
5253                 /* maybe split UFO into V4 and V6? */
5254                 if (!((features & NETIF_F_GEN_CSUM) ||
5255                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5256                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5257                         netdev_info(dev,
5258                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5259                         features &= ~NETIF_F_UFO;
5260                 }
5261
5262                 if (!(features & NETIF_F_SG)) {
5263                         netdev_info(dev,
5264                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5265                         features &= ~NETIF_F_UFO;
5266                 }
5267         }
5268
5269         return features;
5270 }
5271 EXPORT_SYMBOL(netdev_fix_features);
5272
5273 /**
5274  *      netif_stacked_transfer_operstate -      transfer operstate
5275  *      @rootdev: the root or lower level device to transfer state from
5276  *      @dev: the device to transfer operstate to
5277  *
5278  *      Transfer operational state from root to device. This is normally
5279  *      called when a stacking relationship exists between the root
5280  *      device and the device(a leaf device).
5281  */
5282 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5283                                         struct net_device *dev)
5284 {
5285         if (rootdev->operstate == IF_OPER_DORMANT)
5286                 netif_dormant_on(dev);
5287         else
5288                 netif_dormant_off(dev);
5289
5290         if (netif_carrier_ok(rootdev)) {
5291                 if (!netif_carrier_ok(dev))
5292                         netif_carrier_on(dev);
5293         } else {
5294                 if (netif_carrier_ok(dev))
5295                         netif_carrier_off(dev);
5296         }
5297 }
5298 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5299
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed RX queue array for @dev (dev->num_rx_queues
 * entries, which must be at least 1), store it in dev->_rx and point
 * every entry back at @dev.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(nqueues < 1);

	queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
	if (!queues) {
		pr_err("netdev: Unable to allocate %u rx queues.\n", nqueues);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (q = queues; q < queues + nqueues; q++)
		q->dev = dev;

	return 0;
}
#endif
5320
5321 static void netdev_init_one_queue(struct net_device *dev,
5322                                   struct netdev_queue *queue, void *_unused)
5323 {
5324         /* Initialize queue lock */
5325         spin_lock_init(&queue->_xmit_lock);
5326         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5327         queue->xmit_lock_owner = -1;
5328         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5329         queue->dev = dev;
5330 }
5331
5332 static int netif_alloc_netdev_queues(struct net_device *dev)
5333 {
5334         unsigned int count = dev->num_tx_queues;
5335         struct netdev_queue *tx;
5336
5337         BUG_ON(count < 1);
5338
5339         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5340         if (!tx) {
5341                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5342                        count);
5343                 return -ENOMEM;
5344         }
5345         dev->_tx = tx;
5346
5347         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5348         spin_lock_init(&dev->tx_global_lock);
5349
5350         return 0;
5351 }
5352
5353 /**
5354  *      register_netdevice      - register a network device
5355  *      @dev: device to register
5356  *
5357  *      Take a completed network device structure and add it to the kernel
5358  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5359  *      chain. 0 is returned on success. A negative errno code is returned
5360  *      on a failure to set up the device, or if the name is a duplicate.
5361  *
5362  *      Callers must hold the rtnl semaphore. You may want
5363  *      register_netdev() instead of this.
5364  *
5365  *      BUGS:
5366  *      The locking appears insufficient to guarantee two parallel registers
5367  *      will not get the same name.
5368  */
5369
5370 int register_netdevice(struct net_device *dev)
5371 {
5372         int ret;
5373         struct net *net = dev_net(dev);
5374
5375         BUG_ON(dev_boot_phase);
5376         ASSERT_RTNL();
5377
5378         might_sleep();
5379
5380         /* When net_device's are persistent, this will be fatal. */
5381         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5382         BUG_ON(!net);
5383
5384         spin_lock_init(&dev->addr_list_lock);
5385         netdev_set_addr_lockdep_class(dev);
5386
5387         dev->iflink = -1;
5388
5389         /* Init, if this function is available */
5390         if (dev->netdev_ops->ndo_init) {
5391                 ret = dev->netdev_ops->ndo_init(dev);
5392                 if (ret) {
5393                         if (ret > 0)
5394                                 ret = -EIO;
5395                         goto out;
5396                 }
5397         }
5398
5399         ret = dev_get_valid_name(dev, dev->name, 0);
5400         if (ret)
5401                 goto err_uninit;
5402
5403         dev->ifindex = dev_new_index(net);
5404         if (dev->iflink == -1)
5405                 dev->iflink = dev->ifindex;
5406
5407         dev->features = netdev_fix_features(dev, dev->features);
5408
5409         /* Enable software GSO if SG is supported. */
5410         if (dev->features & NETIF_F_SG)
5411                 dev->features |= NETIF_F_GSO;
5412
5413         /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5414          * vlan_dev_init() will do the dev->features check, so these features
5415          * are enabled only if supported by underlying device.
5416          */
5417         dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5418
5419         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5420         ret = notifier_to_errno(ret);
5421         if (ret)
5422                 goto err_uninit;
5423
5424         ret = netdev_register_kobject(dev);
5425         if (ret)
5426                 goto err_uninit;
5427         dev->reg_state = NETREG_REGISTERED;
5428
5429         /*
5430          *      Default initial state at registry is that the
5431          *      device is present.
5432          */
5433
5434         set_bit(__LINK_STATE_PRESENT, &dev->state);
5435
5436         dev_init_scheduler(dev);
5437         dev_hold(dev);
5438         list_netdevice(dev);
5439
5440         /* Notify protocols, that a new device appeared. */
5441         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5442         ret = notifier_to_errno(ret);
5443         if (ret) {
5444                 rollback_registered(dev);
5445                 dev->reg_state = NETREG_UNREGISTERED;
5446         }
5447         /*
5448          *      Prevent userspace races by waiting until the network
5449          *      device is fully setup before sending notifications.
5450          */
5451         if (!dev->rtnl_link_ops ||
5452             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5453                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5454
5455 out:
5456         return ret;
5457
5458 err_uninit:
5459         if (dev->netdev_ops->ndo_uninit)
5460                 dev->netdev_ops->ndo_uninit(dev);
5461         goto out;
5462 }
5463 EXPORT_SYMBOL(register_netdevice);
5464
5465 /**
5466  *      init_dummy_netdev       - init a dummy network device for NAPI
5467  *      @dev: device to init
5468  *
5469  *      This takes a network device structure and initialize the minimum
5470  *      amount of fields so it can be used to schedule NAPI polls without
5471  *      registering a full blown interface. This is to be used by drivers
5472  *      that need to tie several hardware interfaces to a single NAPI
5473  *      poll scheduler due to HW limitations.
5474  */
5475 int init_dummy_netdev(struct net_device *dev)
5476 {
5477         /* Clear everything. Note we don't initialize spinlocks
5478          * are they aren't supposed to be taken by any of the
5479          * NAPI code and this dummy netdev is supposed to be
5480          * only ever used for NAPI polls
5481          */
5482         memset(dev, 0, sizeof(struct net_device));
5483
5484         /* make sure we BUG if trying to hit standard
5485          * register/unregister code path
5486          */
5487         dev->reg_state = NETREG_DUMMY;
5488
5489         /* NAPI wants this */
5490         INIT_LIST_HEAD(&dev->napi_list);
5491
5492         /* a dummy interface is started by default */
5493         set_bit(__LINK_STATE_PRESENT, &dev->state);
5494         set_bit(__LINK_STATE_START, &dev->state);
5495
5496         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5497          * because users of this 'device' dont need to change
5498          * its refcount.
5499          */
5500
5501         return 0;
5502 }
5503 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5504
5505
5506 /**
5507  *      register_netdev - register a network device
5508  *      @dev: device to register
5509  *
5510  *      Take a completed network device structure and add it to the kernel
5511  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5512  *      chain. 0 is returned on success. A negative errno code is returned
5513  *      on a failure to set up the device, or if the name is a duplicate.
5514  *
5515  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5516  *      and expands the device name if you passed a format string to
5517  *      alloc_netdev.
5518  */
5519 int register_netdev(struct net_device *dev)
5520 {
5521         int err;
5522
5523         rtnl_lock();
5524
5525         /*
5526          * If the name is a format string the caller wants us to do a
5527          * name allocation.
5528          */
5529         if (strchr(dev->name, '%')) {
5530                 err = dev_alloc_name(dev, dev->name);
5531                 if (err < 0)
5532                         goto out;
5533         }
5534
5535         err = register_netdevice(dev);
5536 out:
5537         rtnl_unlock();
5538         return err;
5539 }
5540 EXPORT_SYMBOL(register_netdev);
5541
5542 int netdev_refcnt_read(const struct net_device *dev)
5543 {
5544         int i, refcnt = 0;
5545
5546         for_each_possible_cpu(i)
5547                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5548         return refcnt;
5549 }
5550 EXPORT_SYMBOL(netdev_refcnt_read);
5551
5552 /*
5553  * netdev_wait_allrefs - wait until all references are gone.
5554  *
5555  * This is called when unregistering network devices.
5556  *
5557  * Any protocol or device that holds a reference should register
5558  * for netdevice notification, and cleanup and put back the
5559  * reference if they receive an UNREGISTER event.
5560  * We can get stuck here if buggy protocols don't correctly
5561  * call dev_put.
5562  */
5563 static void netdev_wait_allrefs(struct net_device *dev)
5564 {
5565         unsigned long rebroadcast_time, warning_time;
5566         int refcnt;
5567
5568         linkwatch_forget_dev(dev);
5569
5570         rebroadcast_time = warning_time = jiffies;
5571         refcnt = netdev_refcnt_read(dev);
5572
5573         while (refcnt != 0) {
5574                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5575                         rtnl_lock();
5576
5577                         /* Rebroadcast unregister notification */
5578                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5579                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5580                          * should have already handle it the first time */
5581
5582                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5583                                      &dev->state)) {
5584                                 /* We must not have linkwatch events
5585                                  * pending on unregister. If this
5586                                  * happens, we simply run the queue
5587                                  * unscheduled, resulting in a noop
5588                                  * for this device.
5589                                  */
5590                                 linkwatch_run_queue();
5591                         }
5592
5593                         __rtnl_unlock();
5594
5595                         rebroadcast_time = jiffies;
5596                 }
5597
5598                 msleep(250);
5599
5600                 refcnt = netdev_refcnt_read(dev);
5601
5602                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5603                         printk(KERN_EMERG "unregister_netdevice: "
5604                                "waiting for %s to become free. Usage "
5605                                "count = %d\n",
5606                                dev->name, refcnt);
5607                         warning_time = jiffies;
5608                 }
5609         }
5610 }
5611
5612 /* The sequence is:
5613  *
5614  *      rtnl_lock();
5615  *      ...
5616  *      register_netdevice(x1);
5617  *      register_netdevice(x2);
5618  *      ...
5619  *      unregister_netdevice(y1);
5620  *      unregister_netdevice(y2);
5621  *      ...
5622  *      rtnl_unlock();
5623  *      free_netdev(y1);
5624  *      free_netdev(y2);
5625  *
5626  * We are invoked by rtnl_unlock().
5627  * This allows us to deal with problems:
5628  * 1) We can delete sysfs objects which invoke hotplug
5629  *    without deadlocking with linkwatch via keventd.
5630  * 2) Since we run with the RTNL semaphore not held, we can sleep
5631  *    safely in order to wait for the netdev refcnt to drop to zero.
5632  *
5633  * We must not return until all unregister events added during
5634  * the interval the lock was held have been completed.
5635  */
5636 void netdev_run_todo(void)
5637 {
5638         struct list_head list;
5639
5640         /* Snapshot list, allow later requests */
5641         list_replace_init(&net_todo_list, &list);
5642
5643         __rtnl_unlock();
5644
5645         while (!list_empty(&list)) {
5646                 struct net_device *dev
5647                         = list_first_entry(&list, struct net_device, todo_list);
5648                 list_del(&dev->todo_list);
5649
5650                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5651                         printk(KERN_ERR "network todo '%s' but state %d\n",
5652                                dev->name, dev->reg_state);
5653                         dump_stack();
5654                         continue;
5655                 }
5656
5657                 dev->reg_state = NETREG_UNREGISTERED;
5658
5659                 on_each_cpu(flush_backlog, dev, 1);
5660
5661                 netdev_wait_allrefs(dev);
5662
5663                 /* paranoia */
5664                 BUG_ON(netdev_refcnt_read(dev));
5665                 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5666                 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5667                 WARN_ON(dev->dn_ptr);
5668
5669                 if (dev->destructor)
5670                         dev->destructor(dev);
5671
5672                 /* Free network device */
5673                 kobject_put(&dev->dev.kobj);
5674         }
5675 }
5676
5677 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5678  * fields in the same order, with only the type differing.
5679  */
5680 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5681                                     const struct net_device_stats *netdev_stats)
5682 {
5683 #if BITS_PER_LONG == 64
5684         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5685         memcpy(stats64, netdev_stats, sizeof(*stats64));
5686 #else
5687         size_t i, n = sizeof(*stats64) / sizeof(u64);
5688         const unsigned long *src = (const unsigned long *)netdev_stats;
5689         u64 *dst = (u64 *)stats64;
5690
5691         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5692                      sizeof(*stats64) / sizeof(u64));
5693         for (i = 0; i < n; i++)
5694                 dst[i] = src[i];
5695 #endif
5696 }
5697
5698 /**
5699  *      dev_get_stats   - get network device statistics
5700  *      @dev: device to get statistics from
5701  *      @storage: place to store stats
5702  *
5703  *      Get network statistics from device. Return @storage.
5704  *      The device driver may provide its own method by setting
5705  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5706  *      otherwise the internal statistics structure is used.
5707  */
5708 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5709                                         struct rtnl_link_stats64 *storage)
5710 {
5711         const struct net_device_ops *ops = dev->netdev_ops;
5712
5713         if (ops->ndo_get_stats64) {
5714                 memset(storage, 0, sizeof(*storage));
5715                 ops->ndo_get_stats64(dev, storage);
5716         } else if (ops->ndo_get_stats) {
5717                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5718         } else {
5719                 netdev_stats_to_stats64(storage, &dev->stats);
5720         }
5721         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5722         return storage;
5723 }
5724 EXPORT_SYMBOL(dev_get_stats);
5725
/*
 * dev_ingress_queue_create - return the ingress queue of @dev
 *
 * With CONFIG_NET_CLS_ACT, a missing ingress queue is allocated,
 * initialised, attached to noop_qdisc and published in
 * dev->ingress_queue; NULL is returned when that allocation fails.
 * Without CONFIG_NET_CLS_ACT, whatever dev_ingress_queue() reports is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			/* Publish only after the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
5743
5744 /**
5745  *      alloc_netdev_mqs - allocate network device
5746  *      @sizeof_priv:   size of private data to allocate space for
5747  *      @name:          device name format string
5748  *      @setup:         callback to initialize device
5749  *      @txqs:          the number of TX subqueues to allocate
5750  *      @rxqs:          the number of RX subqueues to allocate
5751  *
5752  *      Allocates a struct net_device with private data area for driver use
5753  *      and performs basic initialization.  Also allocates subquue structs
5754  *      for each queue on the device.
5755  */
5756 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5757                 void (*setup)(struct net_device *),
5758                 unsigned int txqs, unsigned int rxqs)
5759 {
5760         struct net_device *dev;
5761         size_t alloc_size;
5762         struct net_device *p;
5763
5764         BUG_ON(strlen(name) >= sizeof(dev->name));
5765
5766         if (txqs < 1) {
5767                 pr_err("alloc_netdev: Unable to allocate device "
5768                        "with zero queues.\n");
5769                 return NULL;
5770         }
5771
5772 #ifdef CONFIG_RPS
5773         if (rxqs < 1) {
5774                 pr_err("alloc_netdev: Unable to allocate device "
5775                        "with zero RX queues.\n");
5776                 return NULL;
5777         }
5778 #endif
5779
5780         alloc_size = sizeof(struct net_device);
5781         if (sizeof_priv) {
5782                 /* ensure 32-byte alignment of private area */
5783                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5784                 alloc_size += sizeof_priv;
5785         }
5786         /* ensure 32-byte alignment of whole construct */
5787         alloc_size += NETDEV_ALIGN - 1;
5788
5789         p = kzalloc(alloc_size, GFP_KERNEL);
5790         if (!p) {
5791                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5792                 return NULL;
5793         }
5794
5795         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5796         dev->padded = (char *)dev - (char *)p;
5797
5798         dev->pcpu_refcnt = alloc_percpu(int);
5799         if (!dev->pcpu_refcnt)
5800                 goto free_p;
5801
5802         if (dev_addr_init(dev))
5803                 goto free_pcpu;
5804
5805         dev_mc_init(dev);
5806         dev_uc_init(dev);
5807
5808         dev_net_set(dev, &init_net);
5809
5810         dev->gso_max_size = GSO_MAX_SIZE;
5811
5812         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5813         dev->ethtool_ntuple_list.count = 0;
5814         INIT_LIST_HEAD(&dev->napi_list);
5815         INIT_LIST_HEAD(&dev->unreg_list);
5816         INIT_LIST_HEAD(&dev->link_watch_list);
5817         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5818         setup(dev);
5819
5820         dev->num_tx_queues = txqs;
5821         dev->real_num_tx_queues = txqs;
5822         if (netif_alloc_netdev_queues(dev))
5823                 goto free_all;
5824
5825 #ifdef CONFIG_RPS
5826         dev->num_rx_queues = rxqs;
5827         dev->real_num_rx_queues = rxqs;
5828         if (netif_alloc_rx_queues(dev))
5829                 goto free_all;
5830 #endif
5831
5832         strcpy(dev->name, name);
5833         dev->group = INIT_NETDEV_GROUP;
5834         return dev;
5835
5836 free_all:
5837         free_netdev(dev);
5838         return NULL;
5839
5840 free_pcpu:
5841         free_percpu(dev->pcpu_refcnt);
5842         kfree(dev->_tx);
5843 #ifdef CONFIG_RPS
5844         kfree(dev->_rx);
5845 #endif
5846
5847 free_p:
5848         kfree(p);
5849         return NULL;
5850 }
5851 EXPORT_SYMBOL(alloc_netdev_mqs);
5852
5853 /**
5854  *      free_netdev - free network device
5855  *      @dev: device
5856  *
5857  *      This function does the last stage of destroying an allocated device
5858  *      interface. The reference to the device object is released.
5859  *      If this is the last reference then it will be freed.
5860  */
5861 void free_netdev(struct net_device *dev)
5862 {
5863         struct napi_struct *p, *n;
5864
5865         release_net(dev_net(dev));
5866
5867         kfree(dev->_tx);
5868 #ifdef CONFIG_RPS
5869         kfree(dev->_rx);
5870 #endif
5871
5872         kfree(rcu_dereference_raw(dev->ingress_queue));
5873
5874         /* Flush device addresses */
5875         dev_addr_flush(dev);
5876
5877         /* Clear ethtool n-tuple list */
5878         ethtool_ntuple_flush(dev);
5879
5880         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5881                 netif_napi_del(p);
5882
5883         free_percpu(dev->pcpu_refcnt);
5884         dev->pcpu_refcnt = NULL;
5885
5886         /*  Compatibility with error handling in drivers */
5887         if (dev->reg_state == NETREG_UNINITIALIZED) {
5888                 kfree((char *)dev - dev->padded);
5889                 return;
5890         }
5891
5892         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5893         dev->reg_state = NETREG_RELEASED;
5894
5895         /* will free via device release */
5896         put_device(&dev->dev);
5897 }
5898 EXPORT_SYMBOL(free_netdev);
5899
/**
 *	synchronize_net - synchronize with packet receive processing
 *
 *	Waits, by means of synchronize_rcu(), for packets that are
 *	currently being received to be done. Packets that start later
 *	are not blocked. May sleep.
 */
void synchronize_net(void)
{
	/* Flag callers that are not allowed to sleep */
	might_sleep();

	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5912
5913 /**
5914  *      unregister_netdevice_queue - remove device from the kernel
5915  *      @dev: device
5916  *      @head: list
5917  *
5918  *      This function shuts down a device interface and removes it
5919  *      from the kernel tables.
5920  *      If head not NULL, device is queued to be unregistered later.
5921  *
5922  *      Callers must hold the rtnl semaphore.  You may want
5923  *      unregister_netdev() instead of this.
5924  */
5925
5926 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5927 {
5928         ASSERT_RTNL();
5929
5930         if (head) {
5931                 list_move_tail(&dev->unreg_list, head);
5932         } else {
5933                 rollback_registered(dev);
5934                 /* Finish processing unregister after unlock */
5935                 net_set_todo(dev);
5936         }
5937 }
5938 EXPORT_SYMBOL(unregister_netdevice_queue);
5939
5940 /**
5941  *      unregister_netdevice_many - unregister many devices
5942  *      @head: list of devices
5943  */
5944 void unregister_netdevice_many(struct list_head *head)
5945 {
5946         struct net_device *dev;
5947
5948         if (!list_empty(head)) {
5949                 rollback_registered_many(head);
5950                 list_for_each_entry(dev, head, unreg_list)
5951                         net_set_todo(dev);
5952         }
5953 }
5954 EXPORT_SYMBOL(unregister_netdevice_many);
5955
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	Wrapper around unregister_netdevice() that takes the rtnl
 *	semaphore for the duration of the call. In general you want to
 *	use this and not unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5974
5975 /**
5976  *      dev_change_net_namespace - move device to different nethost namespace
5977  *      @dev: device
5978  *      @net: network namespace
5979  *      @pat: If not NULL name pattern to try if the current device name
5980  *            is already taken in the destination network namespace.
5981  *
5982  *      This function shuts down a device interface and moves it
5983  *      to a new network namespace. On success 0 is returned, on
5984  *      a failure a netagive errno code is returned.
5985  *
5986  *      Callers must hold the rtnl semaphore.
5987  */
5988
5989 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5990 {
5991         int err;
5992
5993         ASSERT_RTNL();
5994
5995         /* Don't allow namespace local devices to be moved. */
5996         err = -EINVAL;
5997         if (dev->features & NETIF_F_NETNS_LOCAL)
5998                 goto out;
5999
6000         /* Ensure the device has been registrered */
6001         err = -EINVAL;
6002         if (dev->reg_state != NETREG_REGISTERED)
6003                 goto out;
6004
6005         /* Get out if there is nothing todo */
6006         err = 0;
6007         if (net_eq(dev_net(dev), net))
6008                 goto out;
6009
6010         /* Pick the destination device name, and ensure
6011          * we can use it in the destination network namespace.
6012          */
6013         err = -EEXIST;
6014         if (__dev_get_by_name(net, dev->name)) {
6015                 /* We get here if we can't use the current device name */
6016                 if (!pat)
6017                         goto out;
6018                 if (dev_get_valid_name(dev, pat, 1))
6019                         goto out;
6020         }
6021
6022         /*
6023          * And now a mini version of register_netdevice unregister_netdevice.
6024          */
6025
6026         /* If device is running close it first. */
6027         dev_close(dev);
6028
6029         /* And unlink it from device chain */
6030         err = -ENODEV;
6031         unlist_netdevice(dev);
6032
6033         synchronize_net();
6034
6035         /* Shutdown queueing discipline. */
6036         dev_shutdown(dev);
6037
6038         /* Notify protocols, that we are about to destroy
6039            this device. They should clean all the things.
6040
6041            Note that dev->reg_state stays at NETREG_REGISTERED.
6042            This is wanted because this way 8021q and macvlan know
6043            the device is just moving and can keep their slaves up.
6044         */
6045         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6046         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6047
6048         /*
6049          *      Flush the unicast and multicast chains
6050          */
6051         dev_uc_flush(dev);
6052         dev_mc_flush(dev);
6053
6054         /* Actually switch the network namespace */
6055         dev_net_set(dev, net);
6056
6057         /* If there is an ifindex conflict assign a new one */
6058         if (__dev_get_by_index(net, dev->ifindex)) {
6059                 int iflink = (dev->iflink == dev->ifindex);
6060                 dev->ifindex = dev_new_index(net);
6061                 if (iflink)
6062                         dev->iflink = dev->ifindex;
6063         }
6064
6065         /* Fixup kobjects */
6066         err = device_rename(&dev->dev, dev->name);
6067         WARN_ON(err);
6068
6069         /* Add the device back in the hashes */
6070         list_netdevice(dev);
6071
6072         /* Notify protocols, that a new device appeared. */
6073         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6074
6075         /*
6076          *      Prevent userspace races by waiting until the network
6077          *      device is fully setup before sending notifications.
6078          */
6079         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6080
6081         synchronize_net();
6082         err = 0;
6083 out:
6084         return err;
6085 }
6086 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6087
6088 static int dev_cpu_callback(struct notifier_block *nfb,
6089                             unsigned long action,
6090                             void *ocpu)
6091 {
6092         struct sk_buff **list_skb;
6093         struct sk_buff *skb;
6094         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6095         struct softnet_data *sd, *oldsd;
6096
6097         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6098                 return NOTIFY_OK;
6099
6100         local_irq_disable();
6101         cpu = smp_processor_id();
6102         sd = &per_cpu(softnet_data, cpu);
6103         oldsd = &per_cpu(softnet_data, oldcpu);
6104
6105         /* Find end of our completion_queue. */
6106         list_skb = &sd->completion_queue;
6107         while (*list_skb)
6108                 list_skb = &(*list_skb)->next;
6109         /* Append completion queue from offline CPU. */
6110         *list_skb = oldsd->completion_queue;
6111         oldsd->completion_queue = NULL;
6112
6113         /* Append output queue from offline CPU. */
6114         if (oldsd->output_queue) {
6115                 *sd->output_queue_tailp = oldsd->output_queue;
6116                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6117                 oldsd->output_queue = NULL;
6118                 oldsd->output_queue_tailp = &oldsd->output_queue;
6119         }
6120
6121         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6122         local_irq_enable();
6123
6124         /* Process offline CPU's input_pkt_queue */
6125         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6126                 netif_rx(skb);
6127                 input_queue_head_incr(oldsd);
6128         }
6129         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6130                 netif_rx(skb);
6131                 input_queue_head_incr(oldsd);
6132         }
6133
6134         return NOTIFY_OK;
6135 }
6136
6137
6138 /**
6139  *      netdev_increment_features - increment feature set by one
6140  *      @all: current feature set
6141  *      @one: new feature set
6142  *      @mask: mask feature set
6143  *
6144  *      Computes a new feature set after adding a device with feature set
6145  *      @one to the master device with current feature set @all.  Will not
6146  *      enable anything that is off in @mask. Returns the new feature set.
6147  */
6148 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6149 {
6150         /* If device needs checksumming, downgrade to it. */
6151         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6152                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6153         else if (mask & NETIF_F_ALL_CSUM) {
6154                 /* If one device supports v4/v6 checksumming, set for all. */
6155                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6156                     !(all & NETIF_F_GEN_CSUM)) {
6157                         all &= ~NETIF_F_ALL_CSUM;
6158                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6159                 }
6160
6161                 /* If one device supports hw checksumming, set for all. */
6162                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6163                         all &= ~NETIF_F_ALL_CSUM;
6164                         all |= NETIF_F_HW_CSUM;
6165                 }
6166         }
6167
6168         one |= NETIF_F_ALL_CSUM;
6169
6170         one |= all & NETIF_F_ONE_FOR_ALL;
6171         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6172         all |= one & mask & NETIF_F_ONE_FOR_ALL;
6173
6174         return all;
6175 }
6176 EXPORT_SYMBOL(netdev_increment_features);
6177
6178 static struct hlist_head *netdev_create_hash(void)
6179 {
6180         int i;
6181         struct hlist_head *hash;
6182
6183         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6184         if (hash != NULL)
6185                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6186                         INIT_HLIST_HEAD(&hash[i]);
6187
6188         return hash;
6189 }
6190
6191 /* Initialize per network namespace state */
6192 static int __net_init netdev_init(struct net *net)
6193 {
6194         INIT_LIST_HEAD(&net->dev_base_head);
6195
6196         net->dev_name_head = netdev_create_hash();
6197         if (net->dev_name_head == NULL)
6198                 goto err_name;
6199
6200         net->dev_index_head = netdev_create_hash();
6201         if (net->dev_index_head == NULL)
6202                 goto err_idx;
6203
6204         return 0;
6205
6206 err_idx:
6207         kfree(net->dev_name_head);
6208 err_name:
6209         return -ENOMEM;
6210 }
6211
6212 /**
6213  *      netdev_drivername - network driver for the device
6214  *      @dev: network device
6215  *      @buffer: buffer for resulting name
6216  *      @len: size of buffer
6217  *
6218  *      Determine network driver for device.
6219  */
6220 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6221 {
6222         const struct device_driver *driver;
6223         const struct device *parent;
6224
6225         if (len <= 0 || !buffer)
6226                 return buffer;
6227         buffer[0] = 0;
6228
6229         parent = dev->dev.parent;
6230
6231         if (!parent)
6232                 return buffer;
6233
6234         driver = parent->driver;
6235         if (driver && driver->name)
6236                 strlcpy(buffer, driver->name, len);
6237         return buffer;
6238 }
6239
6240 static int __netdev_printk(const char *level, const struct net_device *dev,
6241                            struct va_format *vaf)
6242 {
6243         int r;
6244
6245         if (dev && dev->dev.parent)
6246                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6247                                netdev_name(dev), vaf);
6248         else if (dev)
6249                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6250         else
6251                 r = printk("%s(NULL net_device): %pV", level, vaf);
6252
6253         return r;
6254 }
6255
6256 int netdev_printk(const char *level, const struct net_device *dev,
6257                   const char *format, ...)
6258 {
6259         struct va_format vaf;
6260         va_list args;
6261         int r;
6262
6263         va_start(args, format);
6264
6265         vaf.fmt = format;
6266         vaf.va = &args;
6267
6268         r = __netdev_printk(level, dev, &vaf);
6269         va_end(args);
6270
6271         return r;
6272 }
6273 EXPORT_SYMBOL(netdev_printk);
6274
6275 #define define_netdev_printk_level(func, level)                 \
6276 int func(const struct net_device *dev, const char *fmt, ...)    \
6277 {                                                               \
6278         int r;                                                  \
6279         struct va_format vaf;                                   \
6280         va_list args;                                           \
6281                                                                 \
6282         va_start(args, fmt);                                    \
6283                                                                 \
6284         vaf.fmt = fmt;                                          \
6285         vaf.va = &args;                                         \
6286                                                                 \
6287         r = __netdev_printk(level, dev, &vaf);                  \
6288         va_end(args);                                           \
6289                                                                 \
6290         return r;                                               \
6291 }                                                               \
6292 EXPORT_SYMBOL(func);
6293
6294 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6295 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6296 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6297 define_netdev_printk_level(netdev_err, KERN_ERR);
6298 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6299 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6300 define_netdev_printk_level(netdev_info, KERN_INFO);
6301
6302 static void __net_exit netdev_exit(struct net *net)
6303 {
6304         kfree(net->dev_name_head);
6305         kfree(net->dev_index_head);
6306 }
6307
6308 static struct pernet_operations __net_initdata netdev_net_ops = {
6309         .init = netdev_init,
6310         .exit = netdev_exit,
6311 };
6312
6313 static void __net_exit default_device_exit(struct net *net)
6314 {
6315         struct net_device *dev, *aux;
6316         /*
6317          * Push all migratable network devices back to the
6318          * initial network namespace
6319          */
6320         rtnl_lock();
6321         for_each_netdev_safe(net, dev, aux) {
6322                 int err;
6323                 char fb_name[IFNAMSIZ];
6324
6325                 /* Ignore unmoveable devices (i.e. loopback) */
6326                 if (dev->features & NETIF_F_NETNS_LOCAL)
6327                         continue;
6328
6329                 /* Leave virtual devices for the generic cleanup */
6330                 if (dev->rtnl_link_ops)
6331                         continue;
6332
6333                 /* Push remaing network devices to init_net */
6334                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6335                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6336                 if (err) {
6337                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6338                                 __func__, dev->name, err);
6339                         BUG();
6340                 }
6341         }
6342         rtnl_unlock();
6343 }
6344
6345 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6346 {
6347         /* At exit all network devices most be removed from a network
6348          * namespace.  Do this in the reverse order of registration.
6349          * Do this across as many network namespaces as possible to
6350          * improve batching efficiency.
6351          */
6352         struct net_device *dev;
6353         struct net *net;
6354         LIST_HEAD(dev_kill_list);
6355
6356         rtnl_lock();
6357         list_for_each_entry(net, net_list, exit_list) {
6358                 for_each_netdev_reverse(net, dev) {
6359                         if (dev->rtnl_link_ops)
6360                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6361                         else
6362                                 unregister_netdevice_queue(dev, &dev_kill_list);
6363                 }
6364         }
6365         unregister_netdevice_many(&dev_kill_list);
6366         rtnl_unlock();
6367 }
6368
6369 static struct pernet_operations __net_initdata default_device_ops = {
6370         .exit = default_device_exit,
6371         .exit_batch = default_device_exit_batch,
6372 };
6373
6374 /*
6375  *      Initialize the DEV module. At boot time this walks the device list and
6376  *      unhooks any devices that fail to initialise (normally hardware not
6377  *      present) and leaves us with a valid list of present and active devices.
6378  *
6379  */
6380
6381 /*
6382  *       This is called single threaded during boot, so no need
6383  *       to take the rtnl semaphore.
6384  */
6385 static int __init net_dev_init(void)
6386 {
6387         int i, rc = -ENOMEM;
6388
6389         BUG_ON(!dev_boot_phase);
6390
6391         if (dev_proc_init())
6392                 goto out;
6393
6394         if (netdev_kobject_init())
6395                 goto out;
6396
6397         INIT_LIST_HEAD(&ptype_all);
6398         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6399                 INIT_LIST_HEAD(&ptype_base[i]);
6400
6401         if (register_pernet_subsys(&netdev_net_ops))
6402                 goto out;
6403
6404         /*
6405          *      Initialise the packet receive queues.
6406          */
6407
6408         for_each_possible_cpu(i) {
6409                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6410
6411                 memset(sd, 0, sizeof(*sd));
6412                 skb_queue_head_init(&sd->input_pkt_queue);
6413                 skb_queue_head_init(&sd->process_queue);
6414                 sd->completion_queue = NULL;
6415                 INIT_LIST_HEAD(&sd->poll_list);
6416                 sd->output_queue = NULL;
6417                 sd->output_queue_tailp = &sd->output_queue;
6418 #ifdef CONFIG_RPS
6419                 sd->csd.func = rps_trigger_softirq;
6420                 sd->csd.info = sd;
6421                 sd->csd.flags = 0;
6422                 sd->cpu = i;
6423 #endif
6424
6425                 sd->backlog.poll = process_backlog;
6426                 sd->backlog.weight = weight_p;
6427                 sd->backlog.gro_list = NULL;
6428                 sd->backlog.gro_count = 0;
6429         }
6430
6431         dev_boot_phase = 0;
6432
6433         /* The loopback device is special if any other network devices
6434          * is present in a network namespace the loopback device must
6435          * be present. Since we now dynamically allocate and free the
6436          * loopback device ensure this invariant is maintained by
6437          * keeping the loopback device as the first device on the
6438          * list of network devices.  Ensuring the loopback devices
6439          * is the first device that appears and the last network device
6440          * that disappears.
6441          */
6442         if (register_pernet_device(&loopback_net_ops))
6443                 goto out;
6444
6445         if (register_pernet_device(&default_device_ops))
6446                 goto out;
6447
6448         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6449         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6450
6451         hotcpu_notifier(dev_cpu_callback, 0);
6452         dst_init();
6453         dev_mcast_init();
6454         rc = 0;
6455 out:
6456         return rc;
6457 }
6458
6459 subsys_initcall(net_dev_init);
6460
6461 static int __init initialize_hashrnd(void)
6462 {
6463         get_random_bytes(&hashrnd, sizeof(hashrnd));
6464         return 0;
6465 }
6466
6467 late_initcall_sync(initialize_hashrnd);
6468