net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135 #include <linux/cpu_rmap.h>
 136 #include <linux/if_tunnel.h>
 137 #include <linux/if_pppox.h>
 138 #include <linux/ppp_defs.h>
 139 #include <linux/net_tstamp.h>
 140
 141 #include "net-sysfs.h"
 142
 143 /* Instead of increasing this, you should create a hash table. */
 144 #define MAX_GRO_SKBS 8
 145
 146 /* This should be increased if a protocol with a bigger head is added. */
 147 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 148
 149 /*
 150  *      The list of packet types we will receive (as opposed to discard)
 151  *      and the routines to invoke.
 152  *
 153  *      Why 16. Because with 16 the only overlap we get on a hash of the
 154  *      low nibble of the protocol value is RARP/SNAP/X.25.
 155  *
 156  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 157  *             sure which should go first, but I bet it won't make much
 158  *             difference if we are running VLANs.  The good news is that
 159  *             this protocol won't be in the list unless compiled in, so
 160  *             the average user (w/out VLANs) will not be adversely affected.
 161  *             --BLG
 162  *
 163  *              0800    IP
 164  *              8100    802.1Q VLAN
 165  *              0001    802.3
 166  *              0002    AX.25
 167  *              0004    802.2
 168  *              8035    RARP
 169  *              0005    SNAP
 170  *              0805    X.25
 171  *              0806    ARP
 172  *              8137    IPX
 173  *              0009    Localtalk
 174  *              86DD    IPv6
 175  */
 176
 177 #define PTYPE_HASH_SIZE (16)
 178 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 179
 180 static DEFINE_SPINLOCK(ptype_lock);
 181 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 182 static struct list_head ptype_all __read_mostly;        /* Taps */
 183
 184 /*
 185  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 186  * semaphore.
 187  *
 188  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 189  *
 190  * Writers must hold the rtnl semaphore while they loop through the
 191  * dev_base_head list, and hold dev_base_lock for writing when they do the
 192  * actual updates.  This allows pure readers to access the list even
 193  * while a writer is preparing to update it.
 194  *
 195  * To put it another way, dev_base_lock is held for writing only to
 196  * protect against pure readers; the rtnl semaphore provides the
 197  * protection against other writers.
 198  *
 199  * See, for example usages, register_netdevice() and
 200  * unregister_netdevice(), which must be called with the rtnl
 201  * semaphore held.
 202  */
 203 DEFINE_RWLOCK(dev_base_lock);
 204 EXPORT_SYMBOL(dev_base_lock);
 205
 206 static inline void dev_base_seq_inc(struct net *net)
 207 {
 208         while (++net->dev_base_seq == 0);
 209 }
 210
 211 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 212 {
 213         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 214         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 215 }
 216
 217 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 218 {
 219         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 220 }
 221
 222 static inline void rps_lock(struct softnet_data *sd)
 223 {
 224 #ifdef CONFIG_RPS
 225         spin_lock(&sd->input_pkt_queue.lock);
 226 #endif
 227 }
 228
 229 static inline void rps_unlock(struct softnet_data *sd)
 230 {
 231 #ifdef CONFIG_RPS
 232         spin_unlock(&sd->input_pkt_queue.lock);
 233 #endif
 234 }
 235
 236 /* Device list insertion */
 237 static int list_netdevice(struct net_device *dev)
 238 {
 239         struct net *net = dev_net(dev);
 240
 241         ASSERT_RTNL();
 242
 243         write_lock_bh(&dev_base_lock);
 244         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 245         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 246         hlist_add_head_rcu(&dev->index_hlist,
 247                            dev_index_hash(net, dev->ifindex));
 248         write_unlock_bh(&dev_base_lock);
 249
 250         dev_base_seq_inc(net);
 251
 252         return 0;
 253 }
 254
 255 /* Device list removal
 256  * caller must respect a RCU grace period before freeing/reusing dev
 257  */
 258 static void unlist_netdevice(struct net_device *dev)
 259 {
 260         ASSERT_RTNL();
 261
 262         /* Unlink dev from the device chain */
 263         write_lock_bh(&dev_base_lock);
 264         list_del_rcu(&dev->dev_list);
 265         hlist_del_rcu(&dev->name_hlist);
 266         hlist_del_rcu(&dev->index_hlist);
 267         write_unlock_bh(&dev_base_lock);
 268
 269         dev_base_seq_inc(dev_net(dev));
 270 }
 271
 272 /*
 273  *      Our notifier list
 274  */
 275
 276 static RAW_NOTIFIER_HEAD(netdev_chain);
 277
 278 /*
 279  *      Device drivers call our routines to queue packets here. We empty the
 280  *      queue in the local softnet handler.
 281  */
 282
 283 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 284 EXPORT_PER_CPU_SYMBOL(softnet_data);
 285
 286 #ifdef CONFIG_LOCKDEP
 287 /*
 288  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 289  * according to dev->type
 290  */
 291 static const unsigned short netdev_lock_type[] =
 292         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 293          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 294          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 295          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 296          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 297          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 298          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 299          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 300          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 301          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 302          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 303          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 304          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 305          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 306          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 307          ARPHRD_VOID, ARPHRD_NONE};
 308
 309 static const char *const netdev_lock_name[] =
 310         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 311          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 312          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 313          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 314          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 315          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 316          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 317          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 318          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 319          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 320          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 321          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 322          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 323          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 324          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 325          "_xmit_VOID", "_xmit_NONE"};
 326
 327 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 328 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 329
 330 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 331 {
 332         int i;
 333
 334         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 335                 if (netdev_lock_type[i] == dev_type)
 336                         return i;
 337         /* the last key is used by default */
 338         return ARRAY_SIZE(netdev_lock_type) - 1;
 339 }
 340
 341 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 342                                                  unsigned short dev_type)
 343 {
 344         int i;
 345
 346         i = netdev_lock_pos(dev_type);
 347         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 348                                    netdev_lock_name[i]);
 349 }
 350
 351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 352 {
 353         int i;
 354
 355         i = netdev_lock_pos(dev->type);
 356         lockdep_set_class_and_name(&dev->addr_list_lock,
 357                                    &netdev_addr_lock_key[i],
 358                                    netdev_lock_name[i]);
 359 }
 360 #else
 361 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 362                                                  unsigned short dev_type)
 363 {
 364 }
 365 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 366 {
 367 }
 368 #endif
 369
 370 /*******************************************************************************
 371
 372                 Protocol management and registration routines
 373
 374 *******************************************************************************/
 375
 376 /*
 377  *      Add a protocol ID to the list. Now that the input handler is
 378  *      smarter we can dispense with all the messy stuff that used to be
 379  *      here.
 380  *
 381  *      BEWARE!!! Protocol handlers, mangling input packets,
 382  *      MUST BE last in hash buckets and checking protocol handlers
 383  *      MUST start from promiscuous ptype_all chain in net_bh.
 384  *      It is true now, do not change it.
 385  *      Explanation follows: if protocol handler, mangling packet, will
 386  *      be the first on list, it is not able to sense, that packet
 387  *      is cloned and should be copied-on-write, so that it will
 388  *      change it and subsequent readers will get broken packet.
 389  *                                                      --ANK (980803)
 390  */
 391
 392 static inline struct list_head *ptype_head(const struct packet_type *pt)
 393 {
 394         if (pt->type == htons(ETH_P_ALL))
 395                 return &ptype_all;
 396         else
 397                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 398 }
 399
 400 /**
 401  *      dev_add_pack - add packet handler
 402  *      @pt: packet type declaration
 403  *
 404  *      Add a protocol handler to the networking stack. The passed &packet_type
 405  *      is linked into kernel lists and may not be freed until it has been
 406  *      removed from the kernel lists.
 407  *
 408  *      This call does not sleep therefore it can not
 409  *      guarantee all CPU's that are in middle of receiving packets
 410  *      will see the new packet type (until the next received packet).
 411  */
 412
 413 void dev_add_pack(struct packet_type *pt)
 414 {
 415         struct list_head *head = ptype_head(pt);
 416
 417         spin_lock(&ptype_lock);
 418         list_add_rcu(&pt->list, head);
 419         spin_unlock(&ptype_lock);
 420 }
 421 EXPORT_SYMBOL(dev_add_pack);
 422
 423 /**
 424  *      __dev_remove_pack        - remove packet handler
 425  *      @pt: packet type declaration
 426  *
 427  *      Remove a protocol handler that was previously added to the kernel
 428  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 429  *      from the kernel lists and can be freed or reused once this function
 430  *      returns.
 431  *
 432  *      The packet type might still be in use by receivers
 433  *      and must not be freed until after all the CPU's have gone
 434  *      through a quiescent state.
 435  */
 436 void __dev_remove_pack(struct packet_type *pt)
 437 {
 438         struct list_head *head = ptype_head(pt);
 439         struct packet_type *pt1;
 440
 441         spin_lock(&ptype_lock);
 442
 443         list_for_each_entry(pt1, head, list) {
 444                 if (pt == pt1) {
 445                         list_del_rcu(&pt->list);
 446                         goto out;
 447                 }
 448         }
 449
 450         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 451 out:
 452         spin_unlock(&ptype_lock);
 453 }
 454 EXPORT_SYMBOL(__dev_remove_pack);
 455
 456 /**
 457  *      dev_remove_pack  - remove packet handler
 458  *      @pt: packet type declaration
 459  *
 460  *      Remove a protocol handler that was previously added to the kernel
 461  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 462  *      from the kernel lists and can be freed or reused once this function
 463  *      returns.
 464  *
 465  *      This call sleeps to guarantee that no CPU is looking at the packet
 466  *      type after return.
 467  */
 468 void dev_remove_pack(struct packet_type *pt)
 469 {
 470         __dev_remove_pack(pt);
 471
 472         synchronize_net();
 473 }
 474 EXPORT_SYMBOL(dev_remove_pack);
 475
 476 /******************************************************************************
 477
 478                       Device Boot-time Settings Routines
 479
 480 *******************************************************************************/
 481
 482 /* Boot time configuration table */
 483 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 484
 485 /**
 486  *      netdev_boot_setup_add   - add new setup entry
 487  *      @name: name of the device
 488  *      @map: configured settings for the device
 489  *
 490  *      Adds new setup entry to the dev_boot_setup list.  The function
 491  *      returns 0 on error and 1 on success.  This is a generic routine to
 492  *      all netdevices.
 493  */
 494 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 495 {
 496         struct netdev_boot_setup *s;
 497         int i;
 498
 499         s = dev_boot_setup;
 500         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 501                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 502                         memset(s[i].name, 0, sizeof(s[i].name));
 503                         strlcpy(s[i].name, name, IFNAMSIZ);
 504                         memcpy(&s[i].map, map, sizeof(s[i].map));
 505                         break;
 506                 }
 507         }
 508
 509         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 510 }
 511
 512 /**
 513  *      netdev_boot_setup_check - check boot time settings
 514  *      @dev: the netdevice
 515  *
 516  *      Check boot time settings for the device.
 517  *      The found settings are set for the device to be used
 518  *      later in the device probing.
 519  *      Returns 0 if no settings found, 1 if they are.
 520  */
 521 int netdev_boot_setup_check(struct net_device *dev)
 522 {
 523         struct netdev_boot_setup *s = dev_boot_setup;
 524         int i;
 525
 526         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 527                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 528                     !strcmp(dev->name, s[i].name)) {
 529                         dev->irq        = s[i].map.irq;
 530                         dev->base_addr  = s[i].map.base_addr;
 531                         dev->mem_start  = s[i].map.mem_start;
 532                         dev->mem_end    = s[i].map.mem_end;
 533                         return 1;
 534                 }
 535         }
 536         return 0;
 537 }
 538 EXPORT_SYMBOL(netdev_boot_setup_check);
 539
 540
 541 /**
 542  *      netdev_boot_base        - get address from boot time settings
 543  *      @prefix: prefix for network device
 544  *      @unit: id for network device
 545  *
 546  *      Check boot time settings for the base address of device.
 547  *      The found settings are set for the device to be used
 548  *      later in the device probing.
 549  *      Returns 0 if no settings found.
 550  */
 551 unsigned long netdev_boot_base(const char *prefix, int unit)
 552 {
 553         const struct netdev_boot_setup *s = dev_boot_setup;
 554         char name[IFNAMSIZ];
 555         int i;
 556
 557         sprintf(name, "%s%d", prefix, unit);
 558
 559         /*
 560          * If device already registered then return base of 1
 561          * to indicate not to probe for this interface
 562          */
 563         if (__dev_get_by_name(&init_net, name))
 564                 return 1;
 565
 566         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 567                 if (!strcmp(name, s[i].name))
 568                         return s[i].map.base_addr;
 569         return 0;
 570 }
 571
 572 /*
 573  * Saves at boot time configured settings for any netdevice.
 574  */
 575 int __init netdev_boot_setup(char *str)
 576 {
 577         int ints[5];
 578         struct ifmap map;
 579
 580         str = get_options(str, ARRAY_SIZE(ints), ints);
 581         if (!str || !*str)
 582                 return 0;
 583
 584         /* Save settings */
 585         memset(&map, 0, sizeof(map));
 586         if (ints[0] > 0)
 587                 map.irq = ints[1];
 588         if (ints[0] > 1)
 589                 map.base_addr = ints[2];
 590         if (ints[0] > 2)
 591                 map.mem_start = ints[3];
 592         if (ints[0] > 3)
 593                 map.mem_end = ints[4];
 594
 595         /* Add new entry to the list */
 596         return netdev_boot_setup_add(str, &map);
 597 }
 598
 599 __setup("netdev=", netdev_boot_setup);
 600
 601 /*******************************************************************************
 602
 603                             Device Interface Subroutines
 604
 605 *******************************************************************************/
 606
 607 /**
 608  *      __dev_get_by_name       - find a device by its name
 609  *      @net: the applicable net namespace
 610  *      @name: name to find
 611  *
 612  *      Find an interface by name. Must be called under RTNL semaphore
 613  *      or @dev_base_lock. If the name is found a pointer to the device
 614  *      is returned. If the name is not found then %NULL is returned. The
 615  *      reference counters are not incremented so the caller must be
 616  *      careful with locks.
 617  */
 618
 619 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 620 {
 621         struct hlist_node *p;
 622         struct net_device *dev;
 623         struct hlist_head *head = dev_name_hash(net, name);
 624
 625         hlist_for_each_entry(dev, p, head, name_hlist)
 626                 if (!strncmp(dev->name, name, IFNAMSIZ))
 627                         return dev;
 628
 629         return NULL;
 630 }
 631 EXPORT_SYMBOL(__dev_get_by_name);
 632
 633 /**
 634  *      dev_get_by_name_rcu     - find a device by its name
 635  *      @net: the applicable net namespace
 636  *      @name: name to find
 637  *
 638  *      Find an interface by name.
 639  *      If the name is found a pointer to the device is returned.
 640  *      If the name is not found then %NULL is returned.
 641  *      The reference counters are not incremented so the caller must be
 642  *      careful with locks. The caller must hold RCU lock.
 643  */
 644
 645 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 646 {
 647         struct hlist_node *p;
 648         struct net_device *dev;
 649         struct hlist_head *head = dev_name_hash(net, name);
 650
 651         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 652                 if (!strncmp(dev->name, name, IFNAMSIZ))
 653                         return dev;
 654
 655         return NULL;
 656 }
 657 EXPORT_SYMBOL(dev_get_by_name_rcu);
 658
 659 /**
 660  *      dev_get_by_name         - find a device by its name
 661  *      @net: the applicable net namespace
 662  *      @name: name to find
 663  *
 664  *      Find an interface by name. This can be called from any
 665  *      context and does its own locking. The returned handle has
 666  *      the usage count incremented and the caller must use dev_put() to
 667  *      release it when it is no longer needed. %NULL is returned if no
 668  *      matching device is found.
 669  */
 670
 671 struct net_device *dev_get_by_name(struct net *net, const char *name)
 672 {
 673         struct net_device *dev;
 674
 675         rcu_read_lock();
 676         dev = dev_get_by_name_rcu(net, name);
 677         if (dev)
 678                 dev_hold(dev);
 679         rcu_read_unlock();
 680         return dev;
 681 }
 682 EXPORT_SYMBOL(dev_get_by_name);
 683
 684 /**
 685  *      __dev_get_by_index - find a device by its ifindex
 686  *      @net: the applicable net namespace
 687  *      @ifindex: index of device
 688  *
 689  *      Search for an interface by index. Returns %NULL if the device
 690  *      is not found or a pointer to the device. The device has not
 691  *      had its reference counter increased so the caller must be careful
 692  *      about locking. The caller must hold either the RTNL semaphore
 693  *      or @dev_base_lock.
 694  */
 695
 696 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 697 {
 698         struct hlist_node *p;
 699         struct net_device *dev;
 700         struct hlist_head *head = dev_index_hash(net, ifindex);
 701
 702         hlist_for_each_entry(dev, p, head, index_hlist)
 703                 if (dev->ifindex == ifindex)
 704                         return dev;
 705
 706         return NULL;
 707 }
 708 EXPORT_SYMBOL(__dev_get_by_index);
 709
 710 /**
 711  *      dev_get_by_index_rcu - find a device by its ifindex
 712  *      @net: the applicable net namespace
 713  *      @ifindex: index of device
 714  *
 715  *      Search for an interface by index. Returns %NULL if the device
 716  *      is not found or a pointer to the device. The device has not
 717  *      had its reference counter increased so the caller must be careful
 718  *      about locking. The caller must hold RCU lock.
 719  */
 720
 721 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 722 {
 723         struct hlist_node *p;
 724         struct net_device *dev;
 725         struct hlist_head *head = dev_index_hash(net, ifindex);
 726
 727         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 728                 if (dev->ifindex == ifindex)
 729                         return dev;
 730
 731         return NULL;
 732 }
 733 EXPORT_SYMBOL(dev_get_by_index_rcu);
 734
 735
 736 /**
 737  *      dev_get_by_index - find a device by its ifindex
 738  *      @net: the applicable net namespace
 739  *      @ifindex: index of device
 740  *
 741  *      Search for an interface by index. Returns NULL if the device
 742  *      is not found or a pointer to the device. The device returned has
 743  *      had a reference added and the pointer is safe until the user calls
 744  *      dev_put to indicate they have finished with it.
 745  */
 746
 747 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 748 {
 749         struct net_device *dev;
 750
 751         rcu_read_lock();
 752         dev = dev_get_by_index_rcu(net, ifindex);
 753         if (dev)
 754                 dev_hold(dev);
 755         rcu_read_unlock();
 756         return dev;
 757 }
 758 EXPORT_SYMBOL(dev_get_by_index);
 759
 760 /**
 761  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 762  *      @net: the applicable net namespace
 763  *      @type: media type of device
 764  *      @ha: hardware address
 765  *
 766  *      Search for an interface by MAC address. Returns NULL if the device
 767  *      is not found or a pointer to the device.
 768  *      The caller must hold RCU or RTNL.
 769  *      The returned device has not had its ref count increased
 770  *      and the caller must therefore be careful about locking
 771  *
 772  */
 773
 774 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 775                                        const char *ha)
 776 {
 777         struct net_device *dev;
 778
 779         for_each_netdev_rcu(net, dev)
 780                 if (dev->type == type &&
 781                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 782                         return dev;
 783
 784         return NULL;
 785 }
 786 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 787
 788 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 789 {
 790         struct net_device *dev;
 791
 792         ASSERT_RTNL();
 793         for_each_netdev(net, dev)
 794                 if (dev->type == type)
 795                         return dev;
 796
 797         return NULL;
 798 }
 799 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 800
 801 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 802 {
 803         struct net_device *dev, *ret = NULL;
 804
 805         rcu_read_lock();
 806         for_each_netdev_rcu(net, dev)
 807                 if (dev->type == type) {
 808                         dev_hold(dev);
 809                         ret = dev;
 810                         break;
 811                 }
 812         rcu_read_unlock();
 813         return ret;
 814 }
 815 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 816
 817 /**
 818  *      dev_get_by_flags_rcu - find any device with given flags
 819  *      @net: the applicable net namespace
 820  *      @if_flags: IFF_* values
 821  *      @mask: bitmask of bits in if_flags to check
 822  *
 823  *      Search for any interface with the given flags. Returns NULL if a device
 824  *      is not found or a pointer to the device. Must be called inside
 825  *      rcu_read_lock(), and result refcount is unchanged.
 826  */
 827
 828 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 829                                     unsigned short mask)
 830 {
 831         struct net_device *dev, *ret;
 832
 833         ret = NULL;
 834         for_each_netdev_rcu(net, dev) {
 835                 if (((dev->flags ^ if_flags) & mask) == 0) {
 836                         ret = dev;
 837                         break;
 838                 }
 839         }
 840         return ret;
 841 }
 842 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 843
 844 /**
 845  *      dev_valid_name - check if name is okay for network device
 846  *      @name: name string
 847  *
 848  *      Network device names need to be valid file names to
 849  *      to allow sysfs to work.  We also disallow any kind of
 850  *      whitespace.
 851  */
 852 int dev_valid_name(const char *name)
 853 {
 854         if (*name == '\0')
 855                 return 0;
 856         if (strlen(name) >= IFNAMSIZ)
 857                 return 0;
 858         if (!strcmp(name, ".") || !strcmp(name, ".."))
 859                 return 0;
 860
 861         while (*name) {
 862                 if (*name == '/' || isspace(*name))
 863                         return 0;
 864                 name++;
 865         }
 866         return 1;
 867 }
 868 EXPORT_SYMBOL(dev_valid_name);
 869
 870 /**
 871  *      __dev_alloc_name - allocate a name for a device
 872  *      @net: network namespace to allocate the device name in
 873  *      @name: name format string
 874  *      @buf:  scratch buffer and result name string
 875  *
 876  *      Passed a format string - eg "lt%d" it will try and find a suitable
 877  *      id. It scans list of devices to build up a free map, then chooses
 878  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 879  *      while allocating the name and adding the device in order to avoid
 880  *      duplicates.
 881  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 882  *      Returns the number of the unit assigned or a negative errno code.
 883  */
 884
 885 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 886 {
 887         int i = 0;
 888         const char *p;
 889         const int max_netdevices = 8*PAGE_SIZE;
 890         unsigned long *inuse;
 891         struct net_device *d;
 892
 893         p = strnchr(name, IFNAMSIZ-1, '%');
 894         if (p) {
 895                 /*
 896                  * Verify the string as this thing may have come from
 897                  * the user.  There must be either one "%d" and no other "%"
 898                  * characters.
 899                  */
 900                 if (p[1] != 'd' || strchr(p + 2, '%'))
 901                         return -EINVAL;
 902
 903                 /* Use one page as a bit array of possible slots */
 904                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 905                 if (!inuse)
 906                         return -ENOMEM;
 907
 908                 for_each_netdev(net, d) {
 909                         if (!sscanf(d->name, name, &i))
 910                                 continue;
 911                         if (i < 0 || i >= max_netdevices)
 912                                 continue;
 913
 914                         /*  avoid cases where sscanf is not exact inverse of printf */
 915                         snprintf(buf, IFNAMSIZ, name, i);
 916                         if (!strncmp(buf, d->name, IFNAMSIZ))
 917                                 set_bit(i, inuse);
 918                 }
 919
 920                 i = find_first_zero_bit(inuse, max_netdevices);
 921                 free_page((unsigned long) inuse);
 922         }
 923
 924         if (buf != name)
 925                 snprintf(buf, IFNAMSIZ, name, i);
 926         if (!__dev_get_by_name(net, buf))
 927                 return i;
 928
 929         /* It is possible to run out of possible slots
 930          * when the name is long and there isn't enough space left
 931          * for the digits, or if all bits are used.
 932          */
 933         return -ENFILE;
 934 }
 935
 936 /**
 937  *      dev_alloc_name - allocate a name for a device
 938  *      @dev: device
 939  *      @name: name format string
 940  *
 941  *      Passed a format string - eg "lt%d" it will try and find a suitable
 942  *      id. It scans list of devices to build up a free map, then chooses
 943  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 944  *      while allocating the name and adding the device in order to avoid
 945  *      duplicates.
 946  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 947  *      Returns the number of the unit assigned or a negative errno code.
 948  */
 949
 950 int dev_alloc_name(struct net_device *dev, const char *name)
 951 {
 952         char buf[IFNAMSIZ];
 953         struct net *net;
 954         int ret;
 955
 956         BUG_ON(!dev_net(dev));
 957         net = dev_net(dev);
 958         ret = __dev_alloc_name(net, name, buf);
 959         if (ret >= 0)
 960                 strlcpy(dev->name, buf, IFNAMSIZ);
 961         return ret;
 962 }
 963 EXPORT_SYMBOL(dev_alloc_name);
 964
 965 static int dev_get_valid_name(struct net_device *dev, const char *name)
 966 {
 967         struct net *net;
 968
 969         BUG_ON(!dev_net(dev));
 970         net = dev_net(dev);
 971
 972         if (!dev_valid_name(name))
 973                 return -EINVAL;
 974
 975         if (strchr(name, '%'))
 976                 return dev_alloc_name(dev, name);
 977         else if (__dev_get_by_name(net, name))
 978                 return -EEXIST;
 979         else if (dev->name != name)
 980                 strlcpy(dev->name, name, IFNAMSIZ);
 981
 982         return 0;
 983 }
 984
 985 /**
 986  *      dev_change_name - change name of a device
 987  *      @dev: device
 988  *      @newname: name (or format string) must be at least IFNAMSIZ
 989  *
 990  *      Change name of a device, can pass format strings "eth%d".
 991  *      for wildcarding.
 992  */
 993 int dev_change_name(struct net_device *dev, const char *newname)
 994 {
 995         char oldname[IFNAMSIZ];
 996         int err = 0;
 997         int ret;
 998         struct net *net;
 999
1000         ASSERT_RTNL();
1001         BUG_ON(!dev_net(dev));
1002
1003         net = dev_net(dev);
1004         if (dev->flags & IFF_UP)
1005                 return -EBUSY;
1006
1007         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1008                 return 0;
1009
1010         memcpy(oldname, dev->name, IFNAMSIZ);
1011
1012         err = dev_get_valid_name(dev, newname);
1013         if (err < 0)
1014                 return err;
1015
1016 rollback:
1017         ret = device_rename(&dev->dev, dev->name);
1018         if (ret) {
1019                 memcpy(dev->name, oldname, IFNAMSIZ);
1020                 return ret;
1021         }
1022
1023         write_lock_bh(&dev_base_lock);
1024         hlist_del_rcu(&dev->name_hlist);
1025         write_unlock_bh(&dev_base_lock);
1026
1027         synchronize_rcu();
1028
1029         write_lock_bh(&dev_base_lock);
1030         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1031         write_unlock_bh(&dev_base_lock);
1032
1033         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1034         ret = notifier_to_errno(ret);
1035
1036         if (ret) {
1037                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1038                 if (err >= 0) {
1039                         err = ret;
1040                         memcpy(dev->name, oldname, IFNAMSIZ);
1041                         goto rollback;
1042                 } else {
1043                         printk(KERN_ERR
1044                                "%s: name change rollback failed: %d.\n",
1045                                dev->name, ret);
1046                 }
1047         }
1048
1049         return err;
1050 }
1051
1052 /**
1053  *      dev_set_alias - change ifalias of a device
1054  *      @dev: device
1055  *      @alias: name up to IFALIASZ
1056  *      @len: limit of bytes to copy from info
1057  *
1058  *      Set ifalias for a device,
1059  */
1060 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1061 {
1062         ASSERT_RTNL();
1063
1064         if (len >= IFALIASZ)
1065                 return -EINVAL;
1066
1067         if (!len) {
1068                 if (dev->ifalias) {
1069                         kfree(dev->ifalias);
1070                         dev->ifalias = NULL;
1071                 }
1072                 return 0;
1073         }
1074
1075         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1076         if (!dev->ifalias)
1077                 return -ENOMEM;
1078
1079         strlcpy(dev->ifalias, alias, len+1);
1080         return len;
1081 }
1082
1083
1084 /**
1085  *      netdev_features_change - device changes features
1086  *      @dev: device to cause notification
1087  *
1088  *      Called to indicate a device has changed features.
1089  */
1090 void netdev_features_change(struct net_device *dev)
1091 {
1092         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1093 }
1094 EXPORT_SYMBOL(netdev_features_change);
1095
1096 /**
1097  *      netdev_state_change - device changes state
1098  *      @dev: device to cause notification
1099  *
1100  *      Called to indicate a device has changed state. This function calls
1101  *      the notifier chains for netdev_chain and sends a NEWLINK message
1102  *      to the routing socket.
1103  */
1104 void netdev_state_change(struct net_device *dev)
1105 {
1106         if (dev->flags & IFF_UP) {
1107                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1108                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1109         }
1110 }
1111 EXPORT_SYMBOL(netdev_state_change);
1112
1113 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1114 {
1115         return call_netdevice_notifiers(event, dev);
1116 }
1117 EXPORT_SYMBOL(netdev_bonding_change);
1118
1119 /**
1120  *      dev_load        - load a network module
1121  *      @net: the applicable net namespace
1122  *      @name: name of interface
1123  *
1124  *      If a network interface is not present and the process has suitable
1125  *      privileges this function loads the module. If module loading is not
1126  *      available in this kernel then it becomes a nop.
1127  */
1128
1129 void dev_load(struct net *net, const char *name)
1130 {
1131         struct net_device *dev;
1132         int no_module;
1133
1134         rcu_read_lock();
1135         dev = dev_get_by_name_rcu(net, name);
1136         rcu_read_unlock();
1137
1138         no_module = !dev;
1139         if (no_module && capable(CAP_NET_ADMIN))
1140                 no_module = request_module("netdev-%s", name);
1141         if (no_module && capable(CAP_SYS_MODULE)) {
1142                 if (!request_module("%s", name))
1143                         pr_err("Loading kernel module for a network device "
1144 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1145 "instead\n", name);
1146         }
1147 }
1148 EXPORT_SYMBOL(dev_load);
1149
1150 static int __dev_open(struct net_device *dev)
1151 {
1152         const struct net_device_ops *ops = dev->netdev_ops;
1153         int ret;
1154
1155         ASSERT_RTNL();
1156
1157         if (!netif_device_present(dev))
1158                 return -ENODEV;
1159
1160         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1161         ret = notifier_to_errno(ret);
1162         if (ret)
1163                 return ret;
1164
1165         set_bit(__LINK_STATE_START, &dev->state);
1166
1167         if (ops->ndo_validate_addr)
1168                 ret = ops->ndo_validate_addr(dev);
1169
1170         if (!ret && ops->ndo_open)
1171                 ret = ops->ndo_open(dev);
1172
1173         if (ret)
1174                 clear_bit(__LINK_STATE_START, &dev->state);
1175         else {
1176                 dev->flags |= IFF_UP;
1177                 net_dmaengine_get();
1178                 dev_set_rx_mode(dev);
1179                 dev_activate(dev);
1180         }
1181
1182         return ret;
1183 }
1184
1185 /**
1186  *      dev_open        - prepare an interface for use.
1187  *      @dev:   device to open
1188  *
1189  *      Takes a device from down to up state. The device's private open
1190  *      function is invoked and then the multicast lists are loaded. Finally
1191  *      the device is moved into the up state and a %NETDEV_UP message is
1192  *      sent to the netdev notifier chain.
1193  *
1194  *      Calling this function on an active interface is a nop. On a failure
1195  *      a negative errno code is returned.
1196  */
1197 int dev_open(struct net_device *dev)
1198 {
1199         int ret;
1200
1201         if (dev->flags & IFF_UP)
1202                 return 0;
1203
1204         ret = __dev_open(dev);
1205         if (ret < 0)
1206                 return ret;
1207
1208         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1209         call_netdevice_notifiers(NETDEV_UP, dev);
1210
1211         return ret;
1212 }
1213 EXPORT_SYMBOL(dev_open);
1214
1215 static int __dev_close_many(struct list_head *head)
1216 {
1217         struct net_device *dev;
1218
1219         ASSERT_RTNL();
1220         might_sleep();
1221
1222         list_for_each_entry(dev, head, unreg_list) {
1223                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1224
1225                 clear_bit(__LINK_STATE_START, &dev->state);
1226
1227                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1228                  * can be even on different cpu. So just clear netif_running().
1229                  *
1230                  * dev->stop() will invoke napi_disable() on all of it's
1231                  * napi_struct instances on this device.
1232                  */
1233                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1234         }
1235
1236         dev_deactivate_many(head);
1237
1238         list_for_each_entry(dev, head, unreg_list) {
1239                 const struct net_device_ops *ops = dev->netdev_ops;
1240
1241                 /*
1242                  *      Call the device specific close. This cannot fail.
1243                  *      Only if device is UP
1244                  *
1245                  *      We allow it to be called even after a DETACH hot-plug
1246                  *      event.
1247                  */
1248                 if (ops->ndo_stop)
1249                         ops->ndo_stop(dev);
1250
1251                 dev->flags &= ~IFF_UP;
1252                 net_dmaengine_put();
1253         }
1254
1255         return 0;
1256 }
1257
1258 static int __dev_close(struct net_device *dev)
1259 {
1260         int retval;
1261         LIST_HEAD(single);
1262
1263         list_add(&dev->unreg_list, &single);
1264         retval = __dev_close_many(&single);
1265         list_del(&single);
1266         return retval;
1267 }
1268
1269 static int dev_close_many(struct list_head *head)
1270 {
1271         struct net_device *dev, *tmp;
1272         LIST_HEAD(tmp_list);
1273
1274         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1275                 if (!(dev->flags & IFF_UP))
1276                         list_move(&dev->unreg_list, &tmp_list);
1277
1278         __dev_close_many(head);
1279
1280         list_for_each_entry(dev, head, unreg_list) {
1281                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1282                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1283         }
1284
1285         /* rollback_registered_many needs the complete original list */
1286         list_splice(&tmp_list, head);
1287         return 0;
1288 }
1289
1290 /**
1291  *      dev_close - shutdown an interface.
1292  *      @dev: device to shutdown
1293  *
1294  *      This function moves an active device into down state. A
1295  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1296  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1297  *      chain.
1298  */
1299 int dev_close(struct net_device *dev)
1300 {
1301         if (dev->flags & IFF_UP) {
1302                 LIST_HEAD(single);
1303
1304                 list_add(&dev->unreg_list, &single);
1305                 dev_close_many(&single);
1306                 list_del(&single);
1307         }
1308         return 0;
1309 }
1310 EXPORT_SYMBOL(dev_close);
1311
1312
1313 /**
1314  *      dev_disable_lro - disable Large Receive Offload on a device
1315  *      @dev: device
1316  *
1317  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1318  *      called under RTNL.  This is needed if received packets may be
1319  *      forwarded to another interface.
1320  */
1321 void dev_disable_lro(struct net_device *dev)
1322 {
1323         u32 flags;
1324
1325         /*
1326          * If we're trying to disable lro on a vlan device
1327          * use the underlying physical device instead
1328          */
1329         if (is_vlan_dev(dev))
1330                 dev = vlan_dev_real_dev(dev);
1331
1332         if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1333                 flags = dev->ethtool_ops->get_flags(dev);
1334         else
1335                 flags = ethtool_op_get_flags(dev);
1336
1337         if (!(flags & ETH_FLAG_LRO))
1338                 return;
1339
1340         __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1341         if (unlikely(dev->features & NETIF_F_LRO))
1342                 netdev_WARN(dev, "failed to disable LRO!\n");
1343 }
1344 EXPORT_SYMBOL(dev_disable_lro);
1345
1346
1347 static int dev_boot_phase = 1;
1348
1349 /**
1350  *      register_netdevice_notifier - register a network notifier block
1351  *      @nb: notifier
1352  *
1353  *      Register a notifier to be called when network device events occur.
1354  *      The notifier passed is linked into the kernel structures and must
1355  *      not be reused until it has been unregistered. A negative errno code
1356  *      is returned on a failure.
1357  *
1358  *      When registered all registration and up events are replayed
1359  *      to the new notifier to allow device to have a race free
1360  *      view of the network device list.
1361  */
1362
1363 int register_netdevice_notifier(struct notifier_block *nb)
1364 {
1365         struct net_device *dev;
1366         struct net_device *last;
1367         struct net *net;
1368         int err;
1369
1370         rtnl_lock();
1371         err = raw_notifier_chain_register(&netdev_chain, nb);
1372         if (err)
1373                 goto unlock;
1374         if (dev_boot_phase)
1375                 goto unlock;
1376         for_each_net(net) {
1377                 for_each_netdev(net, dev) {
1378                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1379                         err = notifier_to_errno(err);
1380                         if (err)
1381                                 goto rollback;
1382
1383                         if (!(dev->flags & IFF_UP))
1384                                 continue;
1385
1386                         nb->notifier_call(nb, NETDEV_UP, dev);
1387                 }
1388         }
1389
1390 unlock:
1391         rtnl_unlock();
1392         return err;
1393
1394 rollback:
1395         last = dev;
1396         for_each_net(net) {
1397                 for_each_netdev(net, dev) {
1398                         if (dev == last)
1399                                 goto outroll;
1400
1401                         if (dev->flags & IFF_UP) {
1402                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1403                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1404                         }
1405                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1406                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1407                 }
1408         }
1409
1410 outroll:
1411         raw_notifier_chain_unregister(&netdev_chain, nb);
1412         goto unlock;
1413 }
1414 EXPORT_SYMBOL(register_netdevice_notifier);
1415
1416 /**
1417  *      unregister_netdevice_notifier - unregister a network notifier block
1418  *      @nb: notifier
1419  *
1420  *      Unregister a notifier previously registered by
1421  *      register_netdevice_notifier(). The notifier is unlinked into the
1422  *      kernel structures and may then be reused. A negative errno code
1423  *      is returned on a failure.
1424  *
1425  *      After unregistering unregister and down device events are synthesized
1426  *      for all devices on the device list to the removed notifier to remove
1427  *      the need for special case cleanup code.
1428  */
1429
1430 int unregister_netdevice_notifier(struct notifier_block *nb)
1431 {
1432         struct net_device *dev;
1433         struct net *net;
1434         int err;
1435
1436         rtnl_lock();
1437         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1438         if (err)
1439                 goto unlock;
1440
1441         for_each_net(net) {
1442                 for_each_netdev(net, dev) {
1443                         if (dev->flags & IFF_UP) {
1444                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1445                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1446                         }
1447                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1448                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1449                 }
1450         }
1451 unlock:
1452         rtnl_unlock();
1453         return err;
1454 }
1455 EXPORT_SYMBOL(unregister_netdevice_notifier);
1456
1457 /**
1458  *      call_netdevice_notifiers - call all network notifier blocks
1459  *      @val: value passed unmodified to notifier function
1460  *      @dev: net_device pointer passed unmodified to notifier function
1461  *
1462  *      Call all network notifier blocks.  Parameters and return value
1463  *      are as for raw_notifier_call_chain().
1464  */
1465
1466 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1467 {
1468         ASSERT_RTNL();
1469         return raw_notifier_call_chain(&netdev_chain, val, dev);
1470 }
1471 EXPORT_SYMBOL(call_netdevice_notifiers);
1472
1473 /* When > 0 there are consumers of rx skb time stamps */
1474 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1475
1476 void net_enable_timestamp(void)
1477 {
1478         atomic_inc(&netstamp_needed);
1479 }
1480 EXPORT_SYMBOL(net_enable_timestamp);
1481
1482 void net_disable_timestamp(void)
1483 {
1484         atomic_dec(&netstamp_needed);
1485 }
1486 EXPORT_SYMBOL(net_disable_timestamp);
1487
1488 static inline void net_timestamp_set(struct sk_buff *skb)
1489 {
1490         if (atomic_read(&netstamp_needed))
1491                 __net_timestamp(skb);
1492         else
1493                 skb->tstamp.tv64 = 0;
1494 }
1495
1496 static inline void net_timestamp_check(struct sk_buff *skb)
1497 {
1498         if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1499                 __net_timestamp(skb);
1500 }
1501
1502 static int net_hwtstamp_validate(struct ifreq *ifr)
1503 {
1504         struct hwtstamp_config cfg;
1505         enum hwtstamp_tx_types tx_type;
1506         enum hwtstamp_rx_filters rx_filter;
1507         int tx_type_valid = 0;
1508         int rx_filter_valid = 0;
1509
1510         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1511                 return -EFAULT;
1512
1513         if (cfg.flags) /* reserved for future extensions */
1514                 return -EINVAL;
1515
1516         tx_type = cfg.tx_type;
1517         rx_filter = cfg.rx_filter;
1518
1519         switch (tx_type) {
1520         case HWTSTAMP_TX_OFF:
1521         case HWTSTAMP_TX_ON:
1522         case HWTSTAMP_TX_ONESTEP_SYNC:
1523                 tx_type_valid = 1;
1524                 break;
1525         }
1526
1527         switch (rx_filter) {
1528         case HWTSTAMP_FILTER_NONE:
1529         case HWTSTAMP_FILTER_ALL:
1530         case HWTSTAMP_FILTER_SOME:
1531         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1532         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1533         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1534         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1535         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1536         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1537         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1538         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1539         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1540         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1541         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1542         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1543                 rx_filter_valid = 1;
1544                 break;
1545         }
1546
1547         if (!tx_type_valid || !rx_filter_valid)
1548                 return -ERANGE;
1549
1550         return 0;
1551 }
1552
1553 static inline bool is_skb_forwardable(struct net_device *dev,
1554                                       struct sk_buff *skb)
1555 {
1556         unsigned int len;
1557
1558         if (!(dev->flags & IFF_UP))
1559                 return false;
1560
1561         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1562         if (skb->len <= len)
1563                 return true;
1564
1565         /* if TSO is enabled, we don't care about the length as the packet
1566          * could be forwarded without being segmented before
1567          */
1568         if (skb_is_gso(skb))
1569                 return true;
1570
1571         return false;
1572 }
1573
1574 /**
1575  * dev_forward_skb - loopback an skb to another netif
1576  *
1577  * @dev: destination network device
1578  * @skb: buffer to forward
1579  *
1580  * return values:
1581  *      NET_RX_SUCCESS  (no congestion)
1582  *      NET_RX_DROP     (packet was dropped, but freed)
1583  *
1584  * dev_forward_skb can be used for injecting an skb from the
1585  * start_xmit function of one device into the receive queue
1586  * of another device.
1587  *
1588  * The receiving device may be in another namespace, so
1589  * we have to clear all information in the skb that could
1590  * impact namespace isolation.
1591  */
1592 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1593 {
1594         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1595                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1596                         atomic_long_inc(&dev->rx_dropped);
1597                         kfree_skb(skb);
1598                         return NET_RX_DROP;
1599                 }
1600         }
1601
1602         skb_orphan(skb);
1603         nf_reset(skb);
1604
1605         if (unlikely(!is_skb_forwardable(dev, skb))) {
1606                 atomic_long_inc(&dev->rx_dropped);
1607                 kfree_skb(skb);
1608                 return NET_RX_DROP;
1609         }
1610         skb_set_dev(skb, dev);
1611         skb->tstamp.tv64 = 0;
1612         skb->pkt_type = PACKET_HOST;
1613         skb->protocol = eth_type_trans(skb, dev);
1614         return netif_rx(skb);
1615 }
1616 EXPORT_SYMBOL_GPL(dev_forward_skb);
1617
1618 static inline int deliver_skb(struct sk_buff *skb,
1619                               struct packet_type *pt_prev,
1620                               struct net_device *orig_dev)
1621 {
1622         atomic_inc(&skb->users);
1623         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1624 }
1625
1626 /*
1627  *      Support routine. Sends outgoing frames to any network
1628  *      taps currently in use.
1629  */
1630
1631 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1632 {
1633         struct packet_type *ptype;
1634         struct sk_buff *skb2 = NULL;
1635         struct packet_type *pt_prev = NULL;
1636
1637         rcu_read_lock();
1638         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1639                 /* Never send packets back to the socket
1640                  * they originated from - MvS (miquels@drinkel.ow.org)
1641                  */
1642                 if ((ptype->dev == dev || !ptype->dev) &&
1643                     (ptype->af_packet_priv == NULL ||
1644                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1645                         if (pt_prev) {
1646                                 deliver_skb(skb2, pt_prev, skb->dev);
1647                                 pt_prev = ptype;
1648                                 continue;
1649                         }
1650
1651                         skb2 = skb_clone(skb, GFP_ATOMIC);
1652                         if (!skb2)
1653                                 break;
1654
1655                         net_timestamp_set(skb2);
1656
1657                         /* skb->nh should be correctly
1658                            set by sender, so that the second statement is
1659                            just protection against buggy protocols.
1660                          */
1661                         skb_reset_mac_header(skb2);
1662
1663                         if (skb_network_header(skb2) < skb2->data ||
1664                             skb2->network_header > skb2->tail) {
1665                                 if (net_ratelimit())
1666                                         printk(KERN_CRIT "protocol %04x is "
1667                                                "buggy, dev %s\n",
1668                                                ntohs(skb2->protocol),
1669                                                dev->name);
1670                                 skb_reset_network_header(skb2);
1671                         }
1672
1673                         skb2->transport_header = skb2->network_header;
1674                         skb2->pkt_type = PACKET_OUTGOING;
1675                         pt_prev = ptype;
1676                 }
1677         }
1678         if (pt_prev)
1679                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1680         rcu_read_unlock();
1681 }
1682
1683 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1684  * @dev: Network device
1685  * @txq: number of queues available
1686  *
1687  * If real_num_tx_queues is changed the tc mappings may no longer be
1688  * valid. To resolve this verify the tc mapping remains valid and if
1689  * not NULL the mapping. With no priorities mapping to this
1690  * offset/count pair it will no longer be used. In the worst case TC0
1691  * is invalid nothing can be done so disable priority mappings. If is
1692  * expected that drivers will fix this mapping if they can before
1693  * calling netif_set_real_num_tx_queues.
1694  */
1695 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1696 {
1697         int i;
1698         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1699
1700         /* If TC0 is invalidated disable TC mapping */
1701         if (tc->offset + tc->count > txq) {
1702                 pr_warning("Number of in use tx queues changed "
1703                            "invalidating tc mappings. Priority "
1704                            "traffic classification disabled!\n");
1705                 dev->num_tc = 0;
1706                 return;
1707         }
1708
1709         /* Invalidated prio to tc mappings set to TC0 */
1710         for (i = 1; i < TC_BITMASK + 1; i++) {
1711                 int q = netdev_get_prio_tc_map(dev, i);
1712
1713                 tc = &dev->tc_to_txq[q];
1714                 if (tc->offset + tc->count > txq) {
1715                         pr_warning("Number of in use tx queues "
1716                                    "changed. Priority %i to tc "
1717                                    "mapping %i is no longer valid "
1718                                    "setting map to 0\n",
1719                                    i, q);
1720                         netdev_set_prio_tc_map(dev, i, 0);
1721                 }
1722         }
1723 }
1724
1725 /*
1726  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1727  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1728  */
1729 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1730 {
1731         int rc;
1732
1733         if (txq < 1 || txq > dev->num_tx_queues)
1734                 return -EINVAL;
1735
1736         if (dev->reg_state == NETREG_REGISTERED ||
1737             dev->reg_state == NETREG_UNREGISTERING) {
1738                 ASSERT_RTNL();
1739
1740                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1741                                                   txq);
1742                 if (rc)
1743                         return rc;
1744
1745                 if (dev->num_tc)
1746                         netif_setup_tc(dev, txq);
1747
1748                 if (txq < dev->real_num_tx_queues)
1749                         qdisc_reset_all_tx_gt(dev, txq);
1750         }
1751
1752         dev->real_num_tx_queues = txq;
1753         return 0;
1754 }
1755 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1756
1757 #ifdef CONFIG_RPS
1758 /**
1759  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1760  *      @dev: Network device
1761  *      @rxq: Actual number of RX queues
1762  *
1763  *      This must be called either with the rtnl_lock held or before
1764  *      registration of the net device.  Returns 0 on success, or a
1765  *      negative error code.  If called before registration, it always
1766  *      succeeds.
1767  */
1768 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1769 {
1770         int rc;
1771
1772         if (rxq < 1 || rxq > dev->num_rx_queues)
1773                 return -EINVAL;
1774
1775         if (dev->reg_state == NETREG_REGISTERED) {
1776                 ASSERT_RTNL();
1777
1778                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1779                                                   rxq);
1780                 if (rc)
1781                         return rc;
1782         }
1783
1784         dev->real_num_rx_queues = rxq;
1785         return 0;
1786 }
1787 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1788 #endif
1789
1790 static inline void __netif_reschedule(struct Qdisc *q)
1791 {
1792         struct softnet_data *sd;
1793         unsigned long flags;
1794
1795         local_irq_save(flags);
1796         sd = &__get_cpu_var(softnet_data);
1797         q->next_sched = NULL;
1798         *sd->output_queue_tailp = q;
1799         sd->output_queue_tailp = &q->next_sched;
1800         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1801         local_irq_restore(flags);
1802 }
1803
1804 void __netif_schedule(struct Qdisc *q)
1805 {
1806         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1807                 __netif_reschedule(q);
1808 }
1809 EXPORT_SYMBOL(__netif_schedule);
1810
1811 void dev_kfree_skb_irq(struct sk_buff *skb)
1812 {
1813         if (atomic_dec_and_test(&skb->users)) {
1814                 struct softnet_data *sd;
1815                 unsigned long flags;
1816
1817                 local_irq_save(flags);
1818                 sd = &__get_cpu_var(softnet_data);
1819                 skb->next = sd->completion_queue;
1820                 sd->completion_queue = skb;
1821                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1822                 local_irq_restore(flags);
1823         }
1824 }
1825 EXPORT_SYMBOL(dev_kfree_skb_irq);
1826
1827 void dev_kfree_skb_any(struct sk_buff *skb)
1828 {
1829         if (in_irq() || irqs_disabled())
1830                 dev_kfree_skb_irq(skb);
1831         else
1832                 dev_kfree_skb(skb);
1833 }
1834 EXPORT_SYMBOL(dev_kfree_skb_any);
1835
1836
1837 /**
1838  * netif_device_detach - mark device as removed
1839  * @dev: network device
1840  *
1841  * Mark device as removed from system and therefore no longer available.
1842  */
1843 void netif_device_detach(struct net_device *dev)
1844 {
1845         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1846             netif_running(dev)) {
1847                 netif_tx_stop_all_queues(dev);
1848         }
1849 }
1850 EXPORT_SYMBOL(netif_device_detach);
1851
1852 /**
1853  * netif_device_attach - mark device as attached
1854  * @dev: network device
1855  *
1856  * Mark device as attached from system and restart if needed.
1857  */
1858 void netif_device_attach(struct net_device *dev)
1859 {
1860         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1861             netif_running(dev)) {
1862                 netif_tx_wake_all_queues(dev);
1863                 __netdev_watchdog_up(dev);
1864         }
1865 }
1866 EXPORT_SYMBOL(netif_device_attach);
1867
1868 /**
1869  * skb_dev_set -- assign a new device to a buffer
1870  * @skb: buffer for the new device
1871  * @dev: network device
1872  *
1873  * If an skb is owned by a device already, we have to reset
1874  * all data private to the namespace a device belongs to
1875  * before assigning it a new device.
1876  */
1877 #ifdef CONFIG_NET_NS
1878 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1879 {
1880         skb_dst_drop(skb);
1881         if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1882                 secpath_reset(skb);
1883                 nf_reset(skb);
1884                 skb_init_secmark(skb);
1885                 skb->mark = 0;
1886                 skb->priority = 0;
1887                 skb->nf_trace = 0;
1888                 skb->ipvs_property = 0;
1889 #ifdef CONFIG_NET_SCHED
1890                 skb->tc_index = 0;
1891 #endif
1892         }
1893         skb->dev = dev;
1894 }
1895 EXPORT_SYMBOL(skb_set_dev);
1896 #endif /* CONFIG_NET_NS */
1897
1898 /*
1899  * Invalidate hardware checksum when packet is to be mangled, and
1900  * complete checksum manually on outgoing path.
1901  */
1902 int skb_checksum_help(struct sk_buff *skb)
1903 {
1904         __wsum csum;
1905         int ret = 0, offset;
1906
1907         if (skb->ip_summed == CHECKSUM_COMPLETE)
1908                 goto out_set_summed;
1909
1910         if (unlikely(skb_shinfo(skb)->gso_size)) {
1911                 /* Let GSO fix up the checksum. */
1912                 goto out_set_summed;
1913         }
1914
1915         offset = skb_checksum_start_offset(skb);
1916         BUG_ON(offset >= skb_headlen(skb));
1917         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1918
1919         offset += skb->csum_offset;
1920         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1921
1922         if (skb_cloned(skb) &&
1923             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1924                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1925                 if (ret)
1926                         goto out;
1927         }
1928
1929         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1930 out_set_summed:
1931         skb->ip_summed = CHECKSUM_NONE;
1932 out:
1933         return ret;
1934 }
1935 EXPORT_SYMBOL(skb_checksum_help);
1936
1937 /**
1938  *      skb_gso_segment - Perform segmentation on skb.
1939  *      @skb: buffer to segment
1940  *      @features: features for the output path (see dev->features)
1941  *
1942  *      This function segments the given skb and returns a list of segments.
1943  *
1944  *      It may return NULL if the skb requires no segmentation.  This is
1945  *      only possible when GSO is used for verifying header integrity.
1946  */
1947 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1948 {
1949         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1950         struct packet_type *ptype;
1951         __be16 type = skb->protocol;
1952         int vlan_depth = ETH_HLEN;
1953         int err;
1954
1955         while (type == htons(ETH_P_8021Q)) {
1956                 struct vlan_hdr *vh;
1957
1958                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1959                         return ERR_PTR(-EINVAL);
1960
1961                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1962                 type = vh->h_vlan_encapsulated_proto;
1963                 vlan_depth += VLAN_HLEN;
1964         }
1965
1966         skb_reset_mac_header(skb);
1967         skb->mac_len = skb->network_header - skb->mac_header;
1968         __skb_pull(skb, skb->mac_len);
1969
1970         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1971                 struct net_device *dev = skb->dev;
1972                 struct ethtool_drvinfo info = {};
1973
1974                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1975                         dev->ethtool_ops->get_drvinfo(dev, &info);
1976
1977                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1978                      info.driver, dev ? dev->features : 0L,
1979                      skb->sk ? skb->sk->sk_route_caps : 0L,
1980                      skb->len, skb->data_len, skb->ip_summed);
1981
1982                 if (skb_header_cloned(skb) &&
1983                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1984                         return ERR_PTR(err);
1985         }
1986
1987         rcu_read_lock();
1988         list_for_each_entry_rcu(ptype,
1989                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1990                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1991                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1992                                 err = ptype->gso_send_check(skb);
1993                                 segs = ERR_PTR(err);
1994                                 if (err || skb_gso_ok(skb, features))
1995                                         break;
1996                                 __skb_push(skb, (skb->data -
1997                                                  skb_network_header(skb)));
1998                         }
1999                         segs = ptype->gso_segment(skb, features);
2000                         break;
2001                 }
2002         }
2003         rcu_read_unlock();
2004
2005         __skb_push(skb, skb->data - skb_mac_header(skb));
2006
2007         return segs;
2008 }
2009 EXPORT_SYMBOL(skb_gso_segment);
2010
2011 /* Take action when hardware reception checksum errors are detected. */
2012 #ifdef CONFIG_BUG
2013 void netdev_rx_csum_fault(struct net_device *dev)
2014 {
2015         if (net_ratelimit()) {
2016                 printk(KERN_ERR "%s: hw csum failure.\n",
2017                         dev ? dev->name : "<unknown>");
2018                 dump_stack();
2019         }
2020 }
2021 EXPORT_SYMBOL(netdev_rx_csum_fault);
2022 #endif
2023
2024 /* Actually, we should eliminate this check as soon as we know, that:
2025  * 1. IOMMU is present and allows to map all the memory.
2026  * 2. No high memory really exists on this machine.
2027  */
2028
2029 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2030 {
2031 #ifdef CONFIG_HIGHMEM
2032         int i;
2033         if (!(dev->features & NETIF_F_HIGHDMA)) {
2034                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2035                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2036                         if (PageHighMem(skb_frag_page(frag)))
2037                                 return 1;
2038                 }
2039         }
2040
2041         if (PCI_DMA_BUS_IS_PHYS) {
2042                 struct device *pdev = dev->dev.parent;
2043
2044                 if (!pdev)
2045                         return 0;
2046                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2047                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2048                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2049                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2050                                 return 1;
2051                 }
2052         }
2053 #endif
2054         return 0;
2055 }
2056
2057 struct dev_gso_cb {
2058         void (*destructor)(struct sk_buff *skb);
2059 };
2060
2061 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2062
2063 static void dev_gso_skb_destructor(struct sk_buff *skb)
2064 {
2065         struct dev_gso_cb *cb;
2066
2067         do {
2068                 struct sk_buff *nskb = skb->next;
2069
2070                 skb->next = nskb->next;
2071                 nskb->next = NULL;
2072                 kfree_skb(nskb);
2073         } while (skb->next);
2074
2075         cb = DEV_GSO_CB(skb);
2076         if (cb->destructor)
2077                 cb->destructor(skb);
2078 }
2079
2080 /**
2081  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2082  *      @skb: buffer to segment
2083  *      @features: device features as applicable to this skb
2084  *
2085  *      This function segments the given skb and stores the list of segments
2086  *      in skb->next.
2087  */
2088 static int dev_gso_segment(struct sk_buff *skb, int features)
2089 {
2090         struct sk_buff *segs;
2091
2092         segs = skb_gso_segment(skb, features);
2093
2094         /* Verifying header integrity only. */
2095         if (!segs)
2096                 return 0;
2097
2098         if (IS_ERR(segs))
2099                 return PTR_ERR(segs);
2100
2101         skb->next = segs;
2102         DEV_GSO_CB(skb)->destructor = skb->destructor;
2103         skb->destructor = dev_gso_skb_destructor;
2104
2105         return 0;
2106 }
2107
2108 /*
2109  * Try to orphan skb early, right before transmission by the device.
2110  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2111  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2112  */
2113 static inline void skb_orphan_try(struct sk_buff *skb)
2114 {
2115         struct sock *sk = skb->sk;
2116
2117         if (sk && !skb_shinfo(skb)->tx_flags) {
2118                 /* skb_tx_hash() wont be able to get sk.
2119                  * We copy sk_hash into skb->rxhash
2120                  */
2121                 if (!skb->rxhash)
2122                         skb->rxhash = sk->sk_hash;
2123                 skb_orphan(skb);
2124         }
2125 }
2126
2127 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2128 {
2129         return ((features & NETIF_F_GEN_CSUM) ||
2130                 ((features & NETIF_F_V4_CSUM) &&
2131                  protocol == htons(ETH_P_IP)) ||
2132                 ((features & NETIF_F_V6_CSUM) &&
2133                  protocol == htons(ETH_P_IPV6)) ||
2134                 ((features & NETIF_F_FCOE_CRC) &&
2135                  protocol == htons(ETH_P_FCOE)));
2136 }
2137
2138 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2139 {
2140         if (!can_checksum_protocol(features, protocol)) {
2141                 features &= ~NETIF_F_ALL_CSUM;
2142                 features &= ~NETIF_F_SG;
2143         } else if (illegal_highdma(skb->dev, skb)) {
2144                 features &= ~NETIF_F_SG;
2145         }
2146
2147         return features;
2148 }
2149
2150 u32 netif_skb_features(struct sk_buff *skb)
2151 {
2152         __be16 protocol = skb->protocol;
2153         u32 features = skb->dev->features;
2154
2155         if (protocol == htons(ETH_P_8021Q)) {
2156                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2157                 protocol = veh->h_vlan_encapsulated_proto;
2158         } else if (!vlan_tx_tag_present(skb)) {
2159                 return harmonize_features(skb, protocol, features);
2160         }
2161
2162         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2163
2164         if (protocol != htons(ETH_P_8021Q)) {
2165                 return harmonize_features(skb, protocol, features);
2166         } else {
2167                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2168                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2169                 return harmonize_features(skb, protocol, features);
2170         }
2171 }
2172 EXPORT_SYMBOL(netif_skb_features);
2173
2174 /*
2175  * Returns true if either:
2176  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2177  *      2. skb is fragmented and the device does not support SG, or if
2178  *         at least one of fragments is in highmem and device does not
2179  *         support DMA from it.
2180  */
2181 static inline int skb_needs_linearize(struct sk_buff *skb,
2182                                       int features)
2183 {
2184         return skb_is_nonlinear(skb) &&
2185                         ((skb_has_frag_list(skb) &&
2186                                 !(features & NETIF_F_FRAGLIST)) ||
2187                         (skb_shinfo(skb)->nr_frags &&
2188                                 !(features & NETIF_F_SG)));
2189 }
2190
2191 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2192                         struct netdev_queue *txq)
2193 {
2194         const struct net_device_ops *ops = dev->netdev_ops;
2195         int rc = NETDEV_TX_OK;
2196         unsigned int skb_len;
2197
2198         if (likely(!skb->next)) {
2199                 u32 features;
2200
2201                 /*
2202                  * If device doesn't need skb->dst, release it right now while
2203                  * its hot in this cpu cache
2204                  */
2205                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2206                         skb_dst_drop(skb);
2207
2208                 if (!list_empty(&ptype_all))
2209                         dev_queue_xmit_nit(skb, dev);
2210
2211                 skb_orphan_try(skb);
2212
2213                 features = netif_skb_features(skb);
2214
2215                 if (vlan_tx_tag_present(skb) &&
2216                     !(features & NETIF_F_HW_VLAN_TX)) {
2217                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2218                         if (unlikely(!skb))
2219                                 goto out;
2220
2221                         skb->vlan_tci = 0;
2222                 }
2223
2224                 if (netif_needs_gso(skb, features)) {
2225                         if (unlikely(dev_gso_segment(skb, features)))
2226                                 goto out_kfree_skb;
2227                         if (skb->next)
2228                                 goto gso;
2229                 } else {
2230                         if (skb_needs_linearize(skb, features) &&
2231                             __skb_linearize(skb))
2232                                 goto out_kfree_skb;
2233
2234                         /* If packet is not checksummed and device does not
2235                          * support checksumming for this protocol, complete
2236                          * checksumming here.
2237                          */
2238                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2239                                 skb_set_transport_header(skb,
2240                                         skb_checksum_start_offset(skb));
2241                                 if (!(features & NETIF_F_ALL_CSUM) &&
2242                                      skb_checksum_help(skb))
2243                                         goto out_kfree_skb;
2244                         }
2245                 }
2246
2247                 skb_len = skb->len;
2248                 rc = ops->ndo_start_xmit(skb, dev);
2249                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2250                 if (rc == NETDEV_TX_OK)
2251                         txq_trans_update(txq);
2252                 return rc;
2253         }
2254
2255 gso:
2256         do {
2257                 struct sk_buff *nskb = skb->next;
2258
2259                 skb->next = nskb->next;
2260                 nskb->next = NULL;
2261
2262                 /*
2263                  * If device doesn't need nskb->dst, release it right now while
2264                  * its hot in this cpu cache
2265                  */
2266                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2267                         skb_dst_drop(nskb);
2268
2269                 skb_len = nskb->len;
2270                 rc = ops->ndo_start_xmit(nskb, dev);
2271                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2272                 if (unlikely(rc != NETDEV_TX_OK)) {
2273                         if (rc & ~NETDEV_TX_MASK)
2274                                 goto out_kfree_gso_skb;
2275                         nskb->next = skb->next;
2276                         skb->next = nskb;
2277                         return rc;
2278                 }
2279                 txq_trans_update(txq);
2280                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2281                         return NETDEV_TX_BUSY;
2282         } while (skb->next);
2283
2284 out_kfree_gso_skb:
2285         if (likely(skb->next == NULL))
2286                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2287 out_kfree_skb:
2288         kfree_skb(skb);
2289 out:
2290         return rc;
2291 }
2292
2293 static u32 hashrnd __read_mostly;
2294
2295 /*
2296  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2297  * to be used as a distribution range.
2298  */
2299 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2300                   unsigned int num_tx_queues)
2301 {
2302         u32 hash;
2303         u16 qoffset = 0;
2304         u16 qcount = num_tx_queues;
2305
2306         if (skb_rx_queue_recorded(skb)) {
2307                 hash = skb_get_rx_queue(skb);
2308                 while (unlikely(hash >= num_tx_queues))
2309                         hash -= num_tx_queues;
2310                 return hash;
2311         }
2312
2313         if (dev->num_tc) {
2314                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2315                 qoffset = dev->tc_to_txq[tc].offset;
2316                 qcount = dev->tc_to_txq[tc].count;
2317         }
2318
2319         if (skb->sk && skb->sk->sk_hash)
2320                 hash = skb->sk->sk_hash;
2321         else
2322                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2323         hash = jhash_1word(hash, hashrnd);
2324
2325         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2326 }
2327 EXPORT_SYMBOL(__skb_tx_hash);
2328
2329 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2330 {
2331         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2332                 if (net_ratelimit()) {
2333                         pr_warning("%s selects TX queue %d, but "
2334                                 "real number of TX queues is %d\n",
2335                                 dev->name, queue_index, dev->real_num_tx_queues);
2336                 }
2337                 return 0;
2338         }
2339         return queue_index;
2340 }
2341
2342 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2343 {
2344 #ifdef CONFIG_XPS
2345         struct xps_dev_maps *dev_maps;
2346         struct xps_map *map;
2347         int queue_index = -1;
2348
2349         rcu_read_lock();
2350         dev_maps = rcu_dereference(dev->xps_maps);
2351         if (dev_maps) {
2352                 map = rcu_dereference(
2353                     dev_maps->cpu_map[raw_smp_processor_id()]);
2354                 if (map) {
2355                         if (map->len == 1)
2356                                 queue_index = map->queues[0];
2357                         else {
2358                                 u32 hash;
2359                                 if (skb->sk && skb->sk->sk_hash)
2360                                         hash = skb->sk->sk_hash;
2361                                 else
2362                                         hash = (__force u16) skb->protocol ^
2363                                             skb->rxhash;
2364                                 hash = jhash_1word(hash, hashrnd);
2365                                 queue_index = map->queues[
2366                                     ((u64)hash * map->len) >> 32];
2367                         }
2368                         if (unlikely(queue_index >= dev->real_num_tx_queues))
2369                                 queue_index = -1;
2370                 }
2371         }
2372         rcu_read_unlock();
2373
2374         return queue_index;
2375 #else
2376         return -1;
2377 #endif
2378 }
2379
2380 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2381                                         struct sk_buff *skb)
2382 {
2383         int queue_index;
2384         const struct net_device_ops *ops = dev->netdev_ops;
2385
2386         if (dev->real_num_tx_queues == 1)
2387                 queue_index = 0;
2388         else if (ops->ndo_select_queue) {
2389                 queue_index = ops->ndo_select_queue(dev, skb);
2390                 queue_index = dev_cap_txqueue(dev, queue_index);
2391         } else {
2392                 struct sock *sk = skb->sk;
2393                 queue_index = sk_tx_queue_get(sk);
2394
2395                 if (queue_index < 0 || skb->ooo_okay ||
2396                     queue_index >= dev->real_num_tx_queues) {
2397                         int old_index = queue_index;
2398
2399                         queue_index = get_xps_queue(dev, skb);
2400                         if (queue_index < 0)
2401                                 queue_index = skb_tx_hash(dev, skb);
2402
2403                         if (queue_index != old_index && sk) {
2404                                 struct dst_entry *dst =
2405                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2406
2407                                 if (dst && skb_dst(skb) == dst)
2408                                         sk_tx_queue_set(sk, queue_index);
2409                         }
2410                 }
2411         }
2412
2413         skb_set_queue_mapping(skb, queue_index);
2414         return netdev_get_tx_queue(dev, queue_index);
2415 }
2416
2417 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2418                                  struct net_device *dev,
2419                                  struct netdev_queue *txq)
2420 {
2421         spinlock_t *root_lock = qdisc_lock(q);
2422         bool contended;
2423         int rc;
2424
2425         qdisc_skb_cb(skb)->pkt_len = skb->len;
2426         qdisc_calculate_pkt_len(skb, q);
2427         /*
2428          * Heuristic to force contended enqueues to serialize on a
2429          * separate lock before trying to get qdisc main lock.
2430          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2431          * and dequeue packets faster.
2432          */
2433         contended = qdisc_is_running(q);
2434         if (unlikely(contended))
2435                 spin_lock(&q->busylock);
2436
2437         spin_lock(root_lock);
2438         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2439                 kfree_skb(skb);
2440                 rc = NET_XMIT_DROP;
2441         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2442                    qdisc_run_begin(q)) {
2443                 /*
2444                  * This is a work-conserving queue; there are no old skbs
2445                  * waiting to be sent out; and the qdisc is not running -
2446                  * xmit the skb directly.
2447                  */
2448                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2449                         skb_dst_force(skb);
2450
2451                 qdisc_bstats_update(q, skb);
2452
2453                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2454                         if (unlikely(contended)) {
2455                                 spin_unlock(&q->busylock);
2456                                 contended = false;
2457                         }
2458                         __qdisc_run(q);
2459                 } else
2460                         qdisc_run_end(q);
2461
2462                 rc = NET_XMIT_SUCCESS;
2463         } else {
2464                 skb_dst_force(skb);
2465                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2466                 if (qdisc_run_begin(q)) {
2467                         if (unlikely(contended)) {
2468                                 spin_unlock(&q->busylock);
2469                                 contended = false;
2470                         }
2471                         __qdisc_run(q);
2472                 }
2473         }
2474         spin_unlock(root_lock);
2475         if (unlikely(contended))
2476                 spin_unlock(&q->busylock);
2477         return rc;
2478 }
2479
2480 static DEFINE_PER_CPU(int, xmit_recursion);
2481 #define RECURSION_LIMIT 10
2482
2483 /**
2484  *      dev_queue_xmit - transmit a buffer
2485  *      @skb: buffer to transmit
2486  *
2487  *      Queue a buffer for transmission to a network device. The caller must
2488  *      have set the device and priority and built the buffer before calling
2489  *      this function. The function can be called from an interrupt.
2490  *
2491  *      A negative errno code is returned on a failure. A success does not
2492  *      guarantee the frame will be transmitted as it may be dropped due
2493  *      to congestion or traffic shaping.
2494  *
2495  * -----------------------------------------------------------------------------------
2496  *      I notice this method can also return errors from the queue disciplines,
2497  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2498  *      be positive.
2499  *
2500  *      Regardless of the return value, the skb is consumed, so it is currently
2501  *      difficult to retry a send to this method.  (You can bump the ref count
2502  *      before sending to hold a reference for retry if you are careful.)
2503  *
2504  *      When calling this method, interrupts MUST be enabled.  This is because
2505  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2506  *          --BLG
2507  */
2508 int dev_queue_xmit(struct sk_buff *skb)
2509 {
2510         struct net_device *dev = skb->dev;
2511         struct netdev_queue *txq;
2512         struct Qdisc *q;
2513         int rc = -ENOMEM;
2514
2515         /* Disable soft irqs for various locks below. Also
2516          * stops preemption for RCU.
2517          */
2518         rcu_read_lock_bh();
2519
2520         txq = dev_pick_tx(dev, skb);
2521         q = rcu_dereference_bh(txq->qdisc);
2522
2523 #ifdef CONFIG_NET_CLS_ACT
2524         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2525 #endif
2526         trace_net_dev_queue(skb);
2527         if (q->enqueue) {
2528                 rc = __dev_xmit_skb(skb, q, dev, txq);
2529                 goto out;
2530         }
2531
2532         /* The device has no queue. Common case for software devices:
2533            loopback, all the sorts of tunnels...
2534
2535            Really, it is unlikely that netif_tx_lock protection is necessary
2536            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2537            counters.)
2538            However, it is possible, that they rely on protection
2539            made by us here.
2540
2541            Check this and shot the lock. It is not prone from deadlocks.
2542            Either shot noqueue qdisc, it is even simpler 8)
2543          */
2544         if (dev->flags & IFF_UP) {
2545                 int cpu = smp_processor_id(); /* ok because BHs are off */
2546
2547                 if (txq->xmit_lock_owner != cpu) {
2548
2549                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2550                                 goto recursion_alert;
2551
2552                         HARD_TX_LOCK(dev, txq, cpu);
2553
2554                         if (!netif_tx_queue_stopped(txq)) {
2555                                 __this_cpu_inc(xmit_recursion);
2556                                 rc = dev_hard_start_xmit(skb, dev, txq);
2557                                 __this_cpu_dec(xmit_recursion);
2558                                 if (dev_xmit_complete(rc)) {
2559                                         HARD_TX_UNLOCK(dev, txq);
2560                                         goto out;
2561                                 }
2562                         }
2563                         HARD_TX_UNLOCK(dev, txq);
2564                         if (net_ratelimit())
2565                                 printk(KERN_CRIT "Virtual device %s asks to "
2566                                        "queue packet!\n", dev->name);
2567                 } else {
2568                         /* Recursion is detected! It is possible,
2569                          * unfortunately
2570                          */
2571 recursion_alert:
2572                         if (net_ratelimit())
2573                                 printk(KERN_CRIT "Dead loop on virtual device "
2574                                        "%s, fix it urgently!\n", dev->name);
2575                 }
2576         }
2577
2578         rc = -ENETDOWN;
2579         rcu_read_unlock_bh();
2580
2581         kfree_skb(skb);
2582         return rc;
2583 out:
2584         rcu_read_unlock_bh();
2585         return rc;
2586 }
2587 EXPORT_SYMBOL(dev_queue_xmit);
2588
2589
2590 /*=======================================================================
2591                         Receiver routines
2592   =======================================================================*/
2593
2594 int netdev_max_backlog __read_mostly = 1000;
2595 int netdev_tstamp_prequeue __read_mostly = 1;
2596 int netdev_budget __read_mostly = 300;
2597 int weight_p __read_mostly = 64;            /* old backlog weight */
2598
2599 /* Called with irq disabled */
2600 static inline void ____napi_schedule(struct softnet_data *sd,
2601                                      struct napi_struct *napi)
2602 {
2603         list_add_tail(&napi->poll_list, &sd->poll_list);
2604         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2605 }
2606
2607 /*
2608  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2609  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2610  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2611  * if hash is a canonical 4-tuple hash over transport ports.
2612  */
2613 void __skb_get_rxhash(struct sk_buff *skb)
2614 {
2615         int nhoff, hash = 0, poff;
2616         const struct ipv6hdr *ip6;
2617         const struct iphdr *ip;
2618         const struct vlan_hdr *vlan;
2619         u8 ip_proto;
2620         u32 addr1, addr2;
2621         u16 proto;
2622         union {
2623                 u32 v32;
2624                 u16 v16[2];
2625         } ports;
2626
2627         nhoff = skb_network_offset(skb);
2628         proto = skb->protocol;
2629
2630 again:
2631         switch (proto) {
2632         case __constant_htons(ETH_P_IP):
2633 ip:
2634                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2635                         goto done;
2636
2637                 ip = (const struct iphdr *) (skb->data + nhoff);
2638                 if (ip_is_fragment(ip))
2639                         ip_proto = 0;
2640                 else
2641                         ip_proto = ip->protocol;
2642                 addr1 = (__force u32) ip->saddr;
2643                 addr2 = (__force u32) ip->daddr;
2644                 nhoff += ip->ihl * 4;
2645                 break;
2646         case __constant_htons(ETH_P_IPV6):
2647 ipv6:
2648                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2649                         goto done;
2650
2651                 ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2652                 ip_proto = ip6->nexthdr;
2653                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2654                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2655                 nhoff += 40;
2656                 break;
2657         case __constant_htons(ETH_P_8021Q):
2658                 if (!pskb_may_pull(skb, sizeof(*vlan) + nhoff))
2659                         goto done;
2660                 vlan = (const struct vlan_hdr *) (skb->data + nhoff);
2661                 proto = vlan->h_vlan_encapsulated_proto;
2662                 nhoff += sizeof(*vlan);
2663                 goto again;
2664         case __constant_htons(ETH_P_PPP_SES):
2665                 if (!pskb_may_pull(skb, PPPOE_SES_HLEN + nhoff))
2666                         goto done;
2667                 proto = *((__be16 *) (skb->data + nhoff +
2668                                       sizeof(struct pppoe_hdr)));
2669                 nhoff += PPPOE_SES_HLEN;
2670                 switch (proto) {
2671                 case __constant_htons(PPP_IP):
2672                         goto ip;
2673                 case __constant_htons(PPP_IPV6):
2674                         goto ipv6;
2675                 default:
2676                         goto done;
2677                 }
2678         default:
2679                 goto done;
2680         }
2681
2682         switch (ip_proto) {
2683         case IPPROTO_GRE:
2684                 if (pskb_may_pull(skb, nhoff + 16)) {
2685                         u8 *h = skb->data + nhoff;
2686                         __be16 flags = *(__be16 *)h;
2687
2688                         /*
2689                          * Only look inside GRE if version zero and no
2690                          * routing
2691                          */
2692                         if (!(flags & (GRE_VERSION|GRE_ROUTING))) {
2693                                 proto = *(__be16 *)(h + 2);
2694                                 nhoff += 4;
2695                                 if (flags & GRE_CSUM)
2696                                         nhoff += 4;
2697                                 if (flags & GRE_KEY)
2698                                         nhoff += 4;
2699                                 if (flags & GRE_SEQ)
2700                                         nhoff += 4;
2701                                 goto again;
2702                         }
2703                 }
2704                 break;
2705         case IPPROTO_IPIP:
2706                 goto again;
2707         default:
2708                 break;
2709         }
2710
2711         ports.v32 = 0;
2712         poff = proto_ports_offset(ip_proto);
2713         if (poff >= 0) {
2714                 nhoff += poff;
2715                 if (pskb_may_pull(skb, nhoff + 4)) {
2716                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2717                         if (ports.v16[1] < ports.v16[0])
2718                                 swap(ports.v16[0], ports.v16[1]);
2719                         skb->l4_rxhash = 1;
2720                 }
2721         }
2722
2723         /* get a consistent hash (same value on both flow directions) */
2724         if (addr2 < addr1)
2725                 swap(addr1, addr2);
2726
2727         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2728         if (!hash)
2729                 hash = 1;
2730
2731 done:
2732         skb->rxhash = hash;
2733 }
2734 EXPORT_SYMBOL(__skb_get_rxhash);
2735
2736 #ifdef CONFIG_RPS
2737
2738 /* One global table that all flow-based protocols share. */
2739 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2740 EXPORT_SYMBOL(rps_sock_flow_table);
2741
2742 static struct rps_dev_flow *
2743 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2744             struct rps_dev_flow *rflow, u16 next_cpu)
2745 {
2746         if (next_cpu != RPS_NO_CPU) {
2747 #ifdef CONFIG_RFS_ACCEL
2748                 struct netdev_rx_queue *rxqueue;
2749                 struct rps_dev_flow_table *flow_table;
2750                 struct rps_dev_flow *old_rflow;
2751                 u32 flow_id;
2752                 u16 rxq_index;
2753                 int rc;
2754
2755                 /* Should we steer this flow to a different hardware queue? */
2756                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2757                     !(dev->features & NETIF_F_NTUPLE))
2758                         goto out;
2759                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2760                 if (rxq_index == skb_get_rx_queue(skb))
2761                         goto out;
2762
2763                 rxqueue = dev->_rx + rxq_index;
2764                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2765                 if (!flow_table)
2766                         goto out;
2767                 flow_id = skb->rxhash & flow_table->mask;
2768                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2769                                                         rxq_index, flow_id);
2770                 if (rc < 0)
2771                         goto out;
2772                 old_rflow = rflow;
2773                 rflow = &flow_table->flows[flow_id];
2774                 rflow->filter = rc;
2775                 if (old_rflow->filter == rflow->filter)
2776                         old_rflow->filter = RPS_NO_FILTER;
2777         out:
2778 #endif
2779                 rflow->last_qtail =
2780                         per_cpu(softnet_data, next_cpu).input_queue_head;
2781         }
2782
2783         rflow->cpu = next_cpu;
2784         return rflow;
2785 }
2786
2787 /*
2788  * get_rps_cpu is called from netif_receive_skb and returns the target
2789  * CPU from the RPS map of the receiving queue for a given skb.
2790  * rcu_read_lock must be held on entry.
2791  */
2792 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2793                        struct rps_dev_flow **rflowp)
2794 {
2795         struct netdev_rx_queue *rxqueue;
2796         struct rps_map *map;
2797         struct rps_dev_flow_table *flow_table;
2798         struct rps_sock_flow_table *sock_flow_table;
2799         int cpu = -1;
2800         u16 tcpu;
2801
2802         if (skb_rx_queue_recorded(skb)) {
2803                 u16 index = skb_get_rx_queue(skb);
2804                 if (unlikely(index >= dev->real_num_rx_queues)) {
2805                         WARN_ONCE(dev->real_num_rx_queues > 1,
2806                                   "%s received packet on queue %u, but number "
2807                                   "of RX queues is %u\n",
2808                                   dev->name, index, dev->real_num_rx_queues);
2809                         goto done;
2810                 }
2811                 rxqueue = dev->_rx + index;
2812         } else
2813                 rxqueue = dev->_rx;
2814
2815         map = rcu_dereference(rxqueue->rps_map);
2816         if (map) {
2817                 if (map->len == 1 &&
2818                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2819                         tcpu = map->cpus[0];
2820                         if (cpu_online(tcpu))
2821                                 cpu = tcpu;
2822                         goto done;
2823                 }
2824         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2825                 goto done;
2826         }
2827
2828         skb_reset_network_header(skb);
2829         if (!skb_get_rxhash(skb))
2830                 goto done;
2831
2832         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2833         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2834         if (flow_table && sock_flow_table) {
2835                 u16 next_cpu;
2836                 struct rps_dev_flow *rflow;
2837
2838                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2839                 tcpu = rflow->cpu;
2840
2841                 next_cpu = sock_flow_table->ents[skb->rxhash &
2842                     sock_flow_table->mask];
2843
2844                 /*
2845                  * If the desired CPU (where last recvmsg was done) is
2846                  * different from current CPU (one in the rx-queue flow
2847                  * table entry), switch if one of the following holds:
2848                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2849                  *   - Current CPU is offline.
2850                  *   - The current CPU's queue tail has advanced beyond the
2851                  *     last packet that was enqueued using this table entry.
2852                  *     This guarantees that all previous packets for the flow
2853                  *     have been dequeued, thus preserving in order delivery.
2854                  */
2855                 if (unlikely(tcpu != next_cpu) &&
2856                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2857                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2858                       rflow->last_qtail)) >= 0))
2859                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2860
2861                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2862                         *rflowp = rflow;
2863                         cpu = tcpu;
2864                         goto done;
2865                 }
2866         }
2867
2868         if (map) {
2869                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2870
2871                 if (cpu_online(tcpu)) {
2872                         cpu = tcpu;
2873                         goto done;
2874                 }
2875         }
2876
2877 done:
2878         return cpu;
2879 }
2880
2881 #ifdef CONFIG_RFS_ACCEL
2882
2883 /**
2884  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2885  * @dev: Device on which the filter was set
2886  * @rxq_index: RX queue index
2887  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2888  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2889  *
2890  * Drivers that implement ndo_rx_flow_steer() should periodically call
2891  * this function for each installed filter and remove the filters for
2892  * which it returns %true.
2893  */
2894 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2895                          u32 flow_id, u16 filter_id)
2896 {
2897         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2898         struct rps_dev_flow_table *flow_table;
2899         struct rps_dev_flow *rflow;
2900         bool expire = true;
2901         int cpu;
2902
2903         rcu_read_lock();
2904         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2905         if (flow_table && flow_id <= flow_table->mask) {
2906                 rflow = &flow_table->flows[flow_id];
2907                 cpu = ACCESS_ONCE(rflow->cpu);
2908                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2909                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2910                            rflow->last_qtail) <
2911                      (int)(10 * flow_table->mask)))
2912                         expire = false;
2913         }
2914         rcu_read_unlock();
2915         return expire;
2916 }
2917 EXPORT_SYMBOL(rps_may_expire_flow);
2918
2919 #endif /* CONFIG_RFS_ACCEL */
2920
2921 /* Called from hardirq (IPI) context */
2922 static void rps_trigger_softirq(void *data)
2923 {
2924         struct softnet_data *sd = data;
2925
2926         ____napi_schedule(sd, &sd->backlog);
2927         sd->received_rps++;
2928 }
2929
2930 #endif /* CONFIG_RPS */
2931
2932 /*
2933  * Check if this softnet_data structure is another cpu one
2934  * If yes, queue it to our IPI list and return 1
2935  * If no, return 0
2936  */
2937 static int rps_ipi_queued(struct softnet_data *sd)
2938 {
2939 #ifdef CONFIG_RPS
2940         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2941
2942         if (sd != mysd) {
2943                 sd->rps_ipi_next = mysd->rps_ipi_list;
2944                 mysd->rps_ipi_list = sd;
2945
2946                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2947                 return 1;
2948         }
2949 #endif /* CONFIG_RPS */
2950         return 0;
2951 }
2952
2953 /*
2954  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2955  * queue (may be a remote CPU queue).
2956  */
2957 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2958                               unsigned int *qtail)
2959 {
2960         struct softnet_data *sd;
2961         unsigned long flags;
2962
2963         sd = &per_cpu(softnet_data, cpu);
2964
2965         local_irq_save(flags);
2966
2967         rps_lock(sd);
2968         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2969                 if (skb_queue_len(&sd->input_pkt_queue)) {
2970 enqueue:
2971                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2972                         input_queue_tail_incr_save(sd, qtail);
2973                         rps_unlock(sd);
2974                         local_irq_restore(flags);
2975                         return NET_RX_SUCCESS;
2976                 }
2977
2978                 /* Schedule NAPI for backlog device
2979                  * We can use non atomic operation since we own the queue lock
2980                  */
2981                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2982                         if (!rps_ipi_queued(sd))
2983                                 ____napi_schedule(sd, &sd->backlog);
2984                 }
2985                 goto enqueue;
2986         }
2987
2988         sd->dropped++;
2989         rps_unlock(sd);
2990
2991         local_irq_restore(flags);
2992
2993         atomic_long_inc(&skb->dev->rx_dropped);
2994         kfree_skb(skb);
2995         return NET_RX_DROP;
2996 }
2997
2998 /**
2999  *      netif_rx        -       post buffer to the network code
3000  *      @skb: buffer to post
3001  *
3002  *      This function receives a packet from a device driver and queues it for
3003  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3004  *      may be dropped during processing for congestion control or by the
3005  *      protocol layers.
3006  *
3007  *      return values:
3008  *      NET_RX_SUCCESS  (no congestion)
3009  *      NET_RX_DROP     (packet was dropped)
3010  *
3011  */
3012
3013 int netif_rx(struct sk_buff *skb)
3014 {
3015         int ret;
3016
3017         /* if netpoll wants it, pretend we never saw it */
3018         if (netpoll_rx(skb))
3019                 return NET_RX_DROP;
3020
3021         if (netdev_tstamp_prequeue)
3022                 net_timestamp_check(skb);
3023
3024         trace_netif_rx(skb);
3025 #ifdef CONFIG_RPS
3026         {
3027                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3028                 int cpu;
3029
3030                 preempt_disable();
3031                 rcu_read_lock();
3032
3033                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3034                 if (cpu < 0)
3035                         cpu = smp_processor_id();
3036
3037                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3038
3039                 rcu_read_unlock();
3040                 preempt_enable();
3041         }
3042 #else
3043         {
3044                 unsigned int qtail;
3045                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3046                 put_cpu();
3047         }
3048 #endif
3049         return ret;
3050 }
3051 EXPORT_SYMBOL(netif_rx);
3052
3053 int netif_rx_ni(struct sk_buff *skb)
3054 {
3055         int err;
3056
3057         preempt_disable();
3058         err = netif_rx(skb);
3059         if (local_softirq_pending())
3060                 do_softirq();
3061         preempt_enable();
3062
3063         return err;
3064 }
3065 EXPORT_SYMBOL(netif_rx_ni);
3066
3067 static void net_tx_action(struct softirq_action *h)
3068 {
3069         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3070
3071         if (sd->completion_queue) {
3072                 struct sk_buff *clist;
3073
3074                 local_irq_disable();
3075                 clist = sd->completion_queue;
3076                 sd->completion_queue = NULL;
3077                 local_irq_enable();
3078
3079                 while (clist) {
3080                         struct sk_buff *skb = clist;
3081                         clist = clist->next;
3082
3083                         WARN_ON(atomic_read(&skb->users));
3084                         trace_kfree_skb(skb, net_tx_action);
3085                         __kfree_skb(skb);
3086                 }
3087         }
3088
3089         if (sd->output_queue) {
3090                 struct Qdisc *head;
3091
3092                 local_irq_disable();
3093                 head = sd->output_queue;
3094                 sd->output_queue = NULL;
3095                 sd->output_queue_tailp = &sd->output_queue;
3096                 local_irq_enable();
3097
3098                 while (head) {
3099                         struct Qdisc *q = head;
3100                         spinlock_t *root_lock;
3101
3102                         head = head->next_sched;
3103
3104                         root_lock = qdisc_lock(q);
3105                         if (spin_trylock(root_lock)) {
3106                                 smp_mb__before_clear_bit();
3107                                 clear_bit(__QDISC_STATE_SCHED,
3108                                           &q->state);
3109                                 qdisc_run(q);
3110                                 spin_unlock(root_lock);
3111                         } else {
3112                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3113                                               &q->state)) {
3114                                         __netif_reschedule(q);
3115                                 } else {
3116                                         smp_mb__before_clear_bit();
3117                                         clear_bit(__QDISC_STATE_SCHED,
3118                                                   &q->state);
3119                                 }
3120                         }
3121                 }
3122         }
3123 }
3124
3125 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3126     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3127 /* This hook is defined here for ATM LANE */
3128 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3129                              unsigned char *addr) __read_mostly;
3130 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3131 #endif
3132
3133 #ifdef CONFIG_NET_CLS_ACT
3134 /* TODO: Maybe we should just force sch_ingress to be compiled in
3135  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3136  * a compare and 2 stores extra right now if we dont have it on
3137  * but have CONFIG_NET_CLS_ACT
3138  * NOTE: This doesn't stop any functionality; if you dont have
3139  * the ingress scheduler, you just can't add policies on ingress.
3140  *
3141  */
3142 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3143 {
3144         struct net_device *dev = skb->dev;
3145         u32 ttl = G_TC_RTTL(skb->tc_verd);
3146         int result = TC_ACT_OK;
3147         struct Qdisc *q;
3148
3149         if (unlikely(MAX_RED_LOOP < ttl++)) {
3150                 if (net_ratelimit())
3151                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3152                                skb->skb_iif, dev->ifindex);
3153                 return TC_ACT_SHOT;
3154         }
3155
3156         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3157         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3158
3159         q = rxq->qdisc;
3160         if (q != &noop_qdisc) {
3161                 spin_lock(qdisc_lock(q));
3162                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3163                         result = qdisc_enqueue_root(skb, q);
3164                 spin_unlock(qdisc_lock(q));
3165         }
3166
3167         return result;
3168 }
3169
3170 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3171                                          struct packet_type **pt_prev,
3172                                          int *ret, struct net_device *orig_dev)
3173 {
3174         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3175
3176         if (!rxq || rxq->qdisc == &noop_qdisc)
3177                 goto out;
3178
3179         if (*pt_prev) {
3180                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3181                 *pt_prev = NULL;
3182         }
3183
3184         switch (ing_filter(skb, rxq)) {
3185         case TC_ACT_SHOT:
3186         case TC_ACT_STOLEN:
3187                 kfree_skb(skb);
3188                 return NULL;
3189         }
3190
3191 out:
3192         skb->tc_verd = 0;
3193         return skb;
3194 }
3195 #endif
3196
3197 /**
3198  *      netdev_rx_handler_register - register receive handler
3199  *      @dev: device to register a handler for
3200  *      @rx_handler: receive handler to register
3201  *      @rx_handler_data: data pointer that is used by rx handler
3202  *
3203  *      Register a receive hander for a device. This handler will then be
3204  *      called from __netif_receive_skb. A negative errno code is returned
3205  *      on a failure.
3206  *
3207  *      The caller must hold the rtnl_mutex.
3208  *
3209  *      For a general description of rx_handler, see enum rx_handler_result.
3210  */
3211 int netdev_rx_handler_register(struct net_device *dev,
3212                                rx_handler_func_t *rx_handler,
3213                                void *rx_handler_data)
3214 {
3215         ASSERT_RTNL();
3216
3217         if (dev->rx_handler)
3218                 return -EBUSY;
3219
3220         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3221         rcu_assign_pointer(dev->rx_handler, rx_handler);
3222
3223         return 0;
3224 }
3225 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3226
3227 /**
3228  *      netdev_rx_handler_unregister - unregister receive handler
3229  *      @dev: device to unregister a handler from
3230  *
3231  *      Unregister a receive hander from a device.
3232  *
3233  *      The caller must hold the rtnl_mutex.
3234  */
3235 void netdev_rx_handler_unregister(struct net_device *dev)
3236 {
3237
3238         ASSERT_RTNL();
3239         RCU_INIT_POINTER(dev->rx_handler, NULL);
3240         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3241 }
3242 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3243
3244 static int __netif_receive_skb(struct sk_buff *skb)
3245 {
3246         struct packet_type *ptype, *pt_prev;
3247         rx_handler_func_t *rx_handler;
3248         struct net_device *orig_dev;
3249         struct net_device *null_or_dev;
3250         bool deliver_exact = false;
3251         int ret = NET_RX_DROP;
3252         __be16 type;
3253
3254         if (!netdev_tstamp_prequeue)
3255                 net_timestamp_check(skb);
3256
3257         trace_netif_receive_skb(skb);
3258
3259         /* if we've gotten here through NAPI, check netpoll */
3260         if (netpoll_receive_skb(skb))
3261                 return NET_RX_DROP;
3262
3263         if (!skb->skb_iif)
3264                 skb->skb_iif = skb->dev->ifindex;
3265         orig_dev = skb->dev;
3266
3267         skb_reset_network_header(skb);
3268         skb_reset_transport_header(skb);
3269         skb_reset_mac_len(skb);
3270
3271         pt_prev = NULL;
3272
3273         rcu_read_lock();
3274
3275 another_round:
3276
3277         __this_cpu_inc(softnet_data.processed);
3278
3279         if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3280                 skb = vlan_untag(skb);
3281                 if (unlikely(!skb))
3282                         goto out;
3283         }
3284
3285 #ifdef CONFIG_NET_CLS_ACT
3286         if (skb->tc_verd & TC_NCLS) {
3287                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3288                 goto ncls;
3289         }
3290 #endif
3291
3292         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3293                 if (!ptype->dev || ptype->dev == skb->dev) {
3294                         if (pt_prev)
3295                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3296                         pt_prev = ptype;
3297                 }
3298         }
3299
3300 #ifdef CONFIG_NET_CLS_ACT
3301         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3302         if (!skb)
3303                 goto out;
3304 ncls:
3305 #endif
3306
3307         rx_handler = rcu_dereference(skb->dev->rx_handler);
3308         if (vlan_tx_tag_present(skb)) {
3309                 if (pt_prev) {
3310                         ret = deliver_skb(skb, pt_prev, orig_dev);
3311                         pt_prev = NULL;
3312                 }
3313                 if (vlan_do_receive(&skb, !rx_handler))
3314                         goto another_round;
3315                 else if (unlikely(!skb))
3316                         goto out;
3317         }
3318
3319         if (rx_handler) {
3320                 if (pt_prev) {
3321                         ret = deliver_skb(skb, pt_prev, orig_dev);
3322                         pt_prev = NULL;
3323                 }
3324                 switch (rx_handler(&skb)) {
3325                 case RX_HANDLER_CONSUMED:
3326                         goto out;
3327                 case RX_HANDLER_ANOTHER:
3328                         goto another_round;
3329                 case RX_HANDLER_EXACT:
3330                         deliver_exact = true;
3331                 case RX_HANDLER_PASS:
3332                         break;
3333                 default:
3334                         BUG();
3335                 }
3336         }
3337
3338         /* deliver only exact match when indicated */
3339         null_or_dev = deliver_exact ? skb->dev : NULL;
3340
3341         type = skb->protocol;
3342         list_for_each_entry_rcu(ptype,
3343                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3344                 if (ptype->type == type &&
3345                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3346                      ptype->dev == orig_dev)) {
3347                         if (pt_prev)
3348                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3349                         pt_prev = ptype;
3350                 }
3351         }
3352
3353         if (pt_prev) {
3354                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3355         } else {
3356                 atomic_long_inc(&skb->dev->rx_dropped);
3357                 kfree_skb(skb);
3358                 /* Jamal, now you will not able to escape explaining
3359                  * me how you were going to use this. :-)
3360                  */
3361                 ret = NET_RX_DROP;
3362         }
3363
3364 out:
3365         rcu_read_unlock();
3366         return ret;
3367 }
3368
3369 /**
3370  *      netif_receive_skb - process receive buffer from network
3371  *      @skb: buffer to process
3372  *
3373  *      netif_receive_skb() is the main receive data processing function.
3374  *      It always succeeds. The buffer may be dropped during processing
3375  *      for congestion control or by the protocol layers.
3376  *
3377  *      This function may only be called from softirq context and interrupts
3378  *      should be enabled.
3379  *
3380  *      Return values (usually ignored):
3381  *      NET_RX_SUCCESS: no congestion
3382  *      NET_RX_DROP: packet was dropped
3383  */
3384 int netif_receive_skb(struct sk_buff *skb)
3385 {
3386         if (netdev_tstamp_prequeue)
3387                 net_timestamp_check(skb);
3388
3389         if (skb_defer_rx_timestamp(skb))
3390                 return NET_RX_SUCCESS;
3391
3392 #ifdef CONFIG_RPS
3393         {
3394                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3395                 int cpu, ret;
3396
3397                 rcu_read_lock();
3398
3399                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3400
3401                 if (cpu >= 0) {
3402                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3403                         rcu_read_unlock();
3404                 } else {
3405                         rcu_read_unlock();
3406                         ret = __netif_receive_skb(skb);
3407                 }
3408
3409                 return ret;
3410         }
3411 #else
3412         return __netif_receive_skb(skb);
3413 #endif
3414 }
3415 EXPORT_SYMBOL(netif_receive_skb);
3416
3417 /* Network device is going away, flush any packets still pending
3418  * Called with irqs disabled.
3419  */
3420 static void flush_backlog(void *arg)
3421 {
3422         struct net_device *dev = arg;
3423         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3424         struct sk_buff *skb, *tmp;
3425
3426         rps_lock(sd);
3427         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3428                 if (skb->dev == dev) {
3429                         __skb_unlink(skb, &sd->input_pkt_queue);
3430                         kfree_skb(skb);
3431                         input_queue_head_incr(sd);
3432                 }
3433         }
3434         rps_unlock(sd);
3435
3436         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3437                 if (skb->dev == dev) {
3438                         __skb_unlink(skb, &sd->process_queue);
3439                         kfree_skb(skb);
3440                         input_queue_head_incr(sd);
3441                 }
3442         }
3443 }
3444
3445 static int napi_gro_complete(struct sk_buff *skb)
3446 {
3447         struct packet_type *ptype;
3448         __be16 type = skb->protocol;
3449         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3450         int err = -ENOENT;
3451
3452         if (NAPI_GRO_CB(skb)->count == 1) {
3453                 skb_shinfo(skb)->gso_size = 0;
3454                 goto out;
3455         }
3456
3457         rcu_read_lock();
3458         list_for_each_entry_rcu(ptype, head, list) {
3459                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3460                         continue;
3461
3462                 err = ptype->gro_complete(skb);
3463                 break;
3464         }
3465         rcu_read_unlock();
3466
3467         if (err) {
3468                 WARN_ON(&ptype->list == head);
3469                 kfree_skb(skb);
3470                 return NET_RX_SUCCESS;
3471         }
3472
3473 out:
3474         return netif_receive_skb(skb);
3475 }
3476
3477 inline void napi_gro_flush(struct napi_struct *napi)
3478 {
3479         struct sk_buff *skb, *next;
3480
3481         for (skb = napi->gro_list; skb; skb = next) {
3482                 next = skb->next;
3483                 skb->next = NULL;
3484                 napi_gro_complete(skb);
3485         }
3486
3487         napi->gro_count = 0;
3488         napi->gro_list = NULL;
3489 }
3490 EXPORT_SYMBOL(napi_gro_flush);
3491
3492 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3493 {
3494         struct sk_buff **pp = NULL;
3495         struct packet_type *ptype;
3496         __be16 type = skb->protocol;
3497         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3498         int same_flow;
3499         int mac_len;
3500         enum gro_result ret;
3501
3502         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3503                 goto normal;
3504
3505         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3506                 goto normal;
3507
3508         rcu_read_lock();
3509         list_for_each_entry_rcu(ptype, head, list) {
3510                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3511                         continue;
3512
3513                 skb_set_network_header(skb, skb_gro_offset(skb));
3514                 mac_len = skb->network_header - skb->mac_header;
3515                 skb->mac_len = mac_len;
3516                 NAPI_GRO_CB(skb)->same_flow = 0;
3517                 NAPI_GRO_CB(skb)->flush = 0;
3518                 NAPI_GRO_CB(skb)->free = 0;
3519
3520                 pp = ptype->gro_receive(&napi->gro_list, skb);
3521                 break;
3522         }
3523         rcu_read_unlock();
3524
3525         if (&ptype->list == head)
3526                 goto normal;
3527
3528         same_flow = NAPI_GRO_CB(skb)->same_flow;
3529         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3530
3531         if (pp) {
3532                 struct sk_buff *nskb = *pp;
3533
3534                 *pp = nskb->next;
3535                 nskb->next = NULL;
3536                 napi_gro_complete(nskb);
3537                 napi->gro_count--;
3538         }
3539
3540         if (same_flow)
3541                 goto ok;
3542
3543         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3544                 goto normal;
3545
3546         napi->gro_count++;
3547         NAPI_GRO_CB(skb)->count = 1;
3548         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3549         skb->next = napi->gro_list;
3550         napi->gro_list = skb;
3551         ret = GRO_HELD;
3552
3553 pull:
3554         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3555                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3556
3557                 BUG_ON(skb->end - skb->tail < grow);
3558
3559                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3560
3561                 skb->tail += grow;
3562                 skb->data_len -= grow;
3563
3564                 skb_shinfo(skb)->frags[0].page_offset += grow;
3565                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3566
3567                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3568                         skb_frag_unref(skb, 0);
3569                         memmove(skb_shinfo(skb)->frags,
3570                                 skb_shinfo(skb)->frags + 1,
3571                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3572                 }
3573         }
3574
3575 ok:
3576         return ret;
3577
3578 normal:
3579         ret = GRO_NORMAL;
3580         goto pull;
3581 }
3582 EXPORT_SYMBOL(dev_gro_receive);
3583
3584 static inline gro_result_t
3585 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3586 {
3587         struct sk_buff *p;
3588         unsigned int maclen = skb->dev->hard_header_len;
3589
3590         for (p = napi->gro_list; p; p = p->next) {
3591                 unsigned long diffs;
3592
3593                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3594                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3595                 if (maclen == ETH_HLEN)
3596                         diffs |= compare_ether_header(skb_mac_header(p),
3597                                                       skb_gro_mac_header(skb));
3598                 else if (!diffs)
3599                         diffs = memcmp(skb_mac_header(p),
3600                                        skb_gro_mac_header(skb),
3601                                        maclen);
3602                 NAPI_GRO_CB(p)->same_flow = !diffs;
3603                 NAPI_GRO_CB(p)->flush = 0;
3604         }
3605
3606         return dev_gro_receive(napi, skb);
3607 }
3608
3609 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3610 {
3611         switch (ret) {
3612         case GRO_NORMAL:
3613                 if (netif_receive_skb(skb))
3614                         ret = GRO_DROP;
3615                 break;
3616
3617         case GRO_DROP:
3618         case GRO_MERGED_FREE:
3619                 kfree_skb(skb);
3620                 break;
3621
3622         case GRO_HELD:
3623         case GRO_MERGED:
3624                 break;
3625         }
3626
3627         return ret;
3628 }
3629 EXPORT_SYMBOL(napi_skb_finish);
3630
3631 void skb_gro_reset_offset(struct sk_buff *skb)
3632 {
3633         NAPI_GRO_CB(skb)->data_offset = 0;
3634         NAPI_GRO_CB(skb)->frag0 = NULL;
3635         NAPI_GRO_CB(skb)->frag0_len = 0;
3636
3637         if (skb->mac_header == skb->tail &&
3638             !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3639                 NAPI_GRO_CB(skb)->frag0 =
3640                         skb_frag_address(&skb_shinfo(skb)->frags[0]);
3641                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3642         }
3643 }
3644 EXPORT_SYMBOL(skb_gro_reset_offset);
3645
3646 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3647 {
3648         skb_gro_reset_offset(skb);
3649
3650         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3651 }
3652 EXPORT_SYMBOL(napi_gro_receive);
3653
3654 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3655 {
3656         __skb_pull(skb, skb_headlen(skb));
3657         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3658         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3659         skb->vlan_tci = 0;
3660         skb->dev = napi->dev;
3661         skb->skb_iif = 0;
3662
3663         napi->skb = skb;
3664 }
3665
3666 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3667 {
3668         struct sk_buff *skb = napi->skb;
3669
3670         if (!skb) {
3671                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3672                 if (skb)
3673                         napi->skb = skb;
3674         }
3675         return skb;
3676 }
3677 EXPORT_SYMBOL(napi_get_frags);
3678
3679 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3680                                gro_result_t ret)
3681 {
3682         switch (ret) {
3683         case GRO_NORMAL:
3684         case GRO_HELD:
3685                 skb->protocol = eth_type_trans(skb, skb->dev);
3686
3687                 if (ret == GRO_HELD)
3688                         skb_gro_pull(skb, -ETH_HLEN);
3689                 else if (netif_receive_skb(skb))
3690                         ret = GRO_DROP;
3691                 break;
3692
3693         case GRO_DROP:
3694         case GRO_MERGED_FREE:
3695                 napi_reuse_skb(napi, skb);
3696                 break;
3697
3698         case GRO_MERGED:
3699                 break;
3700         }
3701
3702         return ret;
3703 }
3704 EXPORT_SYMBOL(napi_frags_finish);
3705
3706 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3707 {
3708         struct sk_buff *skb = napi->skb;
3709         struct ethhdr *eth;
3710         unsigned int hlen;
3711         unsigned int off;
3712
3713         napi->skb = NULL;
3714
3715         skb_reset_mac_header(skb);
3716         skb_gro_reset_offset(skb);
3717
3718         off = skb_gro_offset(skb);
3719         hlen = off + sizeof(*eth);
3720         eth = skb_gro_header_fast(skb, off);
3721         if (skb_gro_header_hard(skb, hlen)) {
3722                 eth = skb_gro_header_slow(skb, hlen, off);
3723                 if (unlikely(!eth)) {
3724                         napi_reuse_skb(napi, skb);
3725                         skb = NULL;
3726                         goto out;
3727                 }
3728         }
3729
3730         skb_gro_pull(skb, sizeof(*eth));
3731
3732         /*
3733          * This works because the only protocols we care about don't require
3734          * special handling.  We'll fix it up properly at the end.
3735          */
3736         skb->protocol = eth->h_proto;
3737
3738 out:
3739         return skb;
3740 }
3741 EXPORT_SYMBOL(napi_frags_skb);
3742
3743 gro_result_t napi_gro_frags(struct napi_struct *napi)
3744 {
3745         struct sk_buff *skb = napi_frags_skb(napi);
3746
3747         if (!skb)
3748                 return GRO_DROP;
3749
3750         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3751 }
3752 EXPORT_SYMBOL(napi_gro_frags);
3753
3754 /*
3755  * net_rps_action sends any pending IPI's for rps.
3756  * Note: called with local irq disabled, but exits with local irq enabled.
3757  */
3758 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3759 {
3760 #ifdef CONFIG_RPS
3761         struct softnet_data *remsd = sd->rps_ipi_list;
3762
3763         if (remsd) {
3764                 sd->rps_ipi_list = NULL;
3765
3766                 local_irq_enable();
3767
3768                 /* Send pending IPI's to kick RPS processing on remote cpus. */
3769                 while (remsd) {
3770                         struct softnet_data *next = remsd->rps_ipi_next;
3771
3772                         if (cpu_online(remsd->cpu))
3773                                 __smp_call_function_single(remsd->cpu,
3774                                                            &remsd->csd, 0);
3775                         remsd = next;
3776                 }
3777         } else
3778 #endif
3779                 local_irq_enable();
3780 }
3781
3782 static int process_backlog(struct napi_struct *napi, int quota)
3783 {
3784         int work = 0;
3785         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3786
3787 #ifdef CONFIG_RPS
3788         /* Check if we have pending ipi, its better to send them now,
3789          * not waiting net_rx_action() end.
3790          */
3791         if (sd->rps_ipi_list) {
3792                 local_irq_disable();
3793                 net_rps_action_and_irq_enable(sd);
3794         }
3795 #endif
3796         napi->weight = weight_p;
3797         local_irq_disable();
3798         while (work < quota) {
3799                 struct sk_buff *skb;
3800                 unsigned int qlen;
3801
3802                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3803                         local_irq_enable();
3804                         __netif_receive_skb(skb);
3805                         local_irq_disable();
3806                         input_queue_head_incr(sd);
3807                         if (++work >= quota) {
3808                                 local_irq_enable();
3809                                 return work;
3810                         }
3811                 }
3812
3813                 rps_lock(sd);
3814                 qlen = skb_queue_len(&sd->input_pkt_queue);
3815                 if (qlen)
3816                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3817                                                    &sd->process_queue);
3818
3819                 if (qlen < quota - work) {
3820                         /*
3821                          * Inline a custom version of __napi_complete().
3822                          * only current cpu owns and manipulates this napi,
3823                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3824                          * we can use a plain write instead of clear_bit(),
3825                          * and we dont need an smp_mb() memory barrier.
3826                          */
3827                         list_del(&napi->poll_list);
3828                         napi->state = 0;
3829
3830                         quota = work + qlen;
3831                 }
3832                 rps_unlock(sd);
3833         }
3834         local_irq_enable();
3835
3836         return work;
3837 }
3838
3839 /**
3840  * __napi_schedule - schedule for receive
3841  * @n: entry to schedule
3842  *
3843  * The entry's receive function will be scheduled to run
3844  */
3845 void __napi_schedule(struct napi_struct *n)
3846 {
3847         unsigned long flags;
3848
3849         local_irq_save(flags);
3850         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3851         local_irq_restore(flags);
3852 }
3853 EXPORT_SYMBOL(__napi_schedule);
3854
3855 void __napi_complete(struct napi_struct *n)
3856 {
3857         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3858         BUG_ON(n->gro_list);
3859
3860         list_del(&n->poll_list);
3861         smp_mb__before_clear_bit();
3862         clear_bit(NAPI_STATE_SCHED, &n->state);
3863 }
3864 EXPORT_SYMBOL(__napi_complete);
3865
3866 void napi_complete(struct napi_struct *n)
3867 {
3868         unsigned long flags;
3869
3870         /*
3871          * don't let napi dequeue from the cpu poll list
3872          * just in case its running on a different cpu
3873          */
3874         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3875                 return;
3876
3877         napi_gro_flush(n);
3878         local_irq_save(flags);
3879         __napi_complete(n);
3880         local_irq_restore(flags);
3881 }
3882 EXPORT_SYMBOL(napi_complete);
3883
3884 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3885                     int (*poll)(struct napi_struct *, int), int weight)
3886 {
3887         INIT_LIST_HEAD(&napi->poll_list);
3888         napi->gro_count = 0;
3889         napi->gro_list = NULL;
3890         napi->skb = NULL;
3891         napi->poll = poll;
3892         napi->weight = weight;
3893         list_add(&napi->dev_list, &dev->napi_list);
3894         napi->dev = dev;
3895 #ifdef CONFIG_NETPOLL
3896         spin_lock_init(&napi->poll_lock);
3897         napi->poll_owner = -1;
3898 #endif
3899         set_bit(NAPI_STATE_SCHED, &napi->state);
3900 }
3901 EXPORT_SYMBOL(netif_napi_add);
3902
3903 void netif_napi_del(struct napi_struct *napi)
3904 {
3905         struct sk_buff *skb, *next;
3906
3907         list_del_init(&napi->dev_list);
3908         napi_free_frags(napi);
3909
3910         for (skb = napi->gro_list; skb; skb = next) {
3911                 next = skb->next;
3912                 skb->next = NULL;
3913                 kfree_skb(skb);
3914         }
3915
3916         napi->gro_list = NULL;
3917         napi->gro_count = 0;
3918 }
3919 EXPORT_SYMBOL(netif_napi_del);
3920
3921 static void net_rx_action(struct softirq_action *h)
3922 {
3923         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3924         unsigned long time_limit = jiffies + 2;
3925         int budget = netdev_budget;
3926         void *have;
3927
3928         local_irq_disable();
3929
3930         while (!list_empty(&sd->poll_list)) {
3931                 struct napi_struct *n;
3932                 int work, weight;
3933
3934                 /* If softirq window is exhuasted then punt.
3935                  * Allow this to run for 2 jiffies since which will allow
3936                  * an average latency of 1.5/HZ.
3937                  */
3938                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3939                         goto softnet_break;
3940
3941                 local_irq_enable();
3942
3943                 /* Even though interrupts have been re-enabled, this
3944                  * access is safe because interrupts can only add new
3945                  * entries to the tail of this list, and only ->poll()
3946                  * calls can remove this head entry from the list.
3947                  */
3948                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3949
3950                 have = netpoll_poll_lock(n);
3951
3952                 weight = n->weight;
3953
3954                 /* This NAPI_STATE_SCHED test is for avoiding a race
3955                  * with netpoll's poll_napi().  Only the entity which
3956                  * obtains the lock and sees NAPI_STATE_SCHED set will
3957                  * actually make the ->poll() call.  Therefore we avoid
3958                  * accidentally calling ->poll() when NAPI is not scheduled.
3959                  */
3960                 work = 0;
3961                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3962                         work = n->poll(n, weight);
3963                         trace_napi_poll(n);
3964                 }
3965
3966                 WARN_ON_ONCE(work > weight);
3967
3968                 budget -= work;
3969
3970                 local_irq_disable();
3971
3972                 /* Drivers must not modify the NAPI state if they
3973                  * consume the entire weight.  In such cases this code
3974                  * still "owns" the NAPI instance and therefore can
3975                  * move the instance around on the list at-will.
3976                  */
3977                 if (unlikely(work == weight)) {
3978                         if (unlikely(napi_disable_pending(n))) {
3979                                 local_irq_enable();
3980                                 napi_complete(n);
3981                                 local_irq_disable();
3982                         } else
3983                                 list_move_tail(&n->poll_list, &sd->poll_list);
3984                 }
3985
3986                 netpoll_poll_unlock(have);
3987         }
3988 out:
3989         net_rps_action_and_irq_enable(sd);
3990
3991 #ifdef CONFIG_NET_DMA
3992         /*
3993          * There may not be any more sk_buffs coming right now, so push
3994          * any pending DMA copies to hardware
3995          */
3996         dma_issue_pending_all();
3997 #endif
3998
3999         return;
4000
4001 softnet_break:
4002         sd->time_squeeze++;
4003         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4004         goto out;
4005 }
4006
4007 static gifconf_func_t *gifconf_list[NPROTO];
4008
4009 /**
4010  *      register_gifconf        -       register a SIOCGIF handler
4011  *      @family: Address family
4012  *      @gifconf: Function handler
4013  *
4014  *      Register protocol dependent address dumping routines. The handler
4015  *      that is passed must not be freed or reused until it has been replaced
4016  *      by another handler.
4017  */
4018 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4019 {
4020         if (family >= NPROTO)
4021                 return -EINVAL;
4022         gifconf_list[family] = gifconf;
4023         return 0;
4024 }
4025 EXPORT_SYMBOL(register_gifconf);
4026
4027
4028 /*
4029  *      Map an interface index to its name (SIOCGIFNAME)
4030  */
4031
4032 /*
4033  *      We need this ioctl for efficient implementation of the
4034  *      if_indextoname() function required by the IPv6 API.  Without
4035  *      it, we would have to search all the interfaces to find a
4036  *      match.  --pb
4037  */
4038
4039 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4040 {
4041         struct net_device *dev;
4042         struct ifreq ifr;
4043
4044         /*
4045          *      Fetch the caller's info block.
4046          */
4047
4048         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4049                 return -EFAULT;
4050
4051         rcu_read_lock();
4052         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4053         if (!dev) {
4054                 rcu_read_unlock();
4055                 return -ENODEV;
4056         }
4057
4058         strcpy(ifr.ifr_name, dev->name);
4059         rcu_read_unlock();
4060
4061         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4062                 return -EFAULT;
4063         return 0;
4064 }
4065
4066 /*
4067  *      Perform a SIOCGIFCONF call. This structure will change
4068  *      size eventually, and there is nothing I can do about it.
4069  *      Thus we will need a 'compatibility mode'.
4070  */
4071
4072 static int dev_ifconf(struct net *net, char __user *arg)
4073 {
4074         struct ifconf ifc;
4075         struct net_device *dev;
4076         char __user *pos;
4077         int len;
4078         int total;
4079         int i;
4080
4081         /*
4082          *      Fetch the caller's info block.
4083          */
4084
4085         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4086                 return -EFAULT;
4087
4088         pos = ifc.ifc_buf;
4089         len = ifc.ifc_len;
4090
4091         /*
4092          *      Loop over the interfaces, and write an info block for each.
4093          */
4094
4095         total = 0;
4096         for_each_netdev(net, dev) {
4097                 for (i = 0; i < NPROTO; i++) {
4098                         if (gifconf_list[i]) {
4099                                 int done;
4100                                 if (!pos)
4101                                         done = gifconf_list[i](dev, NULL, 0);
4102                                 else
4103                                         done = gifconf_list[i](dev, pos + total,
4104                                                                len - total);
4105                                 if (done < 0)
4106                                         return -EFAULT;
4107                                 total += done;
4108                         }
4109                 }
4110         }
4111
4112         /*
4113          *      All done.  Write the updated control block back to the caller.
4114          */
4115         ifc.ifc_len = total;
4116
4117         /*
4118          *      Both BSD and Solaris return 0 here, so we do too.
4119          */
4120         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4121 }
4122
4123 #ifdef CONFIG_PROC_FS
4124
4125 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4126
4127 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4128 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4129 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4130
4131 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4132 {
4133         struct net *net = seq_file_net(seq);
4134         struct net_device *dev;
4135         struct hlist_node *p;
4136         struct hlist_head *h;
4137         unsigned int count = 0, offset = get_offset(*pos);
4138
4139         h = &net->dev_name_head[get_bucket(*pos)];
4140         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4141                 if (++count == offset)
4142                         return dev;
4143         }
4144
4145         return NULL;
4146 }
4147
4148 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4149 {
4150         struct net_device *dev;
4151         unsigned int bucket;
4152
4153         do {
4154                 dev = dev_from_same_bucket(seq, pos);
4155                 if (dev)
4156                         return dev;
4157
4158                 bucket = get_bucket(*pos) + 1;
4159                 *pos = set_bucket_offset(bucket, 1);
4160         } while (bucket < NETDEV_HASHENTRIES);
4161
4162         return NULL;
4163 }
4164
4165 /*
4166  *      This is invoked by the /proc filesystem handler to display a device
4167  *      in detail.
4168  */
4169 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4170         __acquires(RCU)
4171 {
4172         rcu_read_lock();
4173         if (!*pos)
4174                 return SEQ_START_TOKEN;
4175
4176         if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4177                 return NULL;
4178
4179         return dev_from_bucket(seq, pos);
4180 }
4181
4182 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4183 {
4184         ++*pos;
4185         return dev_from_bucket(seq, pos);
4186 }
4187
4188 void dev_seq_stop(struct seq_file *seq, void *v)
4189         __releases(RCU)
4190 {
4191         rcu_read_unlock();
4192 }
4193
4194 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4195 {
4196         struct rtnl_link_stats64 temp;
4197         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4198
4199         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4200                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4201                    dev->name, stats->rx_bytes, stats->rx_packets,
4202                    stats->rx_errors,
4203                    stats->rx_dropped + stats->rx_missed_errors,
4204                    stats->rx_fifo_errors,
4205                    stats->rx_length_errors + stats->rx_over_errors +
4206                     stats->rx_crc_errors + stats->rx_frame_errors,
4207                    stats->rx_compressed, stats->multicast,
4208                    stats->tx_bytes, stats->tx_packets,
4209                    stats->tx_errors, stats->tx_dropped,
4210                    stats->tx_fifo_errors, stats->collisions,
4211                    stats->tx_carrier_errors +
4212                     stats->tx_aborted_errors +
4213                     stats->tx_window_errors +
4214                     stats->tx_heartbeat_errors,
4215                    stats->tx_compressed);
4216 }
4217
4218 /*
4219  *      Called from the PROCfs module. This now uses the new arbitrary sized
4220  *      /proc/net interface to create /proc/net/dev
4221  */
4222 static int dev_seq_show(struct seq_file *seq, void *v)
4223 {
4224         if (v == SEQ_START_TOKEN)
4225                 seq_puts(seq, "Inter-|   Receive                            "
4226                               "                    |  Transmit\n"
4227                               " face |bytes    packets errs drop fifo frame "
4228                               "compressed multicast|bytes    packets errs "
4229                               "drop fifo colls carrier compressed\n");
4230         else
4231                 dev_seq_printf_stats(seq, v);
4232         return 0;
4233 }
4234
4235 static struct softnet_data *softnet_get_online(loff_t *pos)
4236 {
4237         struct softnet_data *sd = NULL;
4238
4239         while (*pos < nr_cpu_ids)
4240                 if (cpu_online(*pos)) {
4241                         sd = &per_cpu(softnet_data, *pos);
4242                         break;
4243                 } else
4244                         ++*pos;
4245         return sd;
4246 }
4247
4248 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4249 {
4250         return softnet_get_online(pos);
4251 }
4252
4253 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4254 {
4255         ++*pos;
4256         return softnet_get_online(pos);
4257 }
4258
4259 static void softnet_seq_stop(struct seq_file *seq, void *v)
4260 {
4261 }
4262
4263 static int softnet_seq_show(struct seq_file *seq, void *v)
4264 {
4265         struct softnet_data *sd = v;
4266
4267         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4268                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4269                    0, 0, 0, 0, /* was fastroute */
4270                    sd->cpu_collision, sd->received_rps);
4271         return 0;
4272 }
4273
4274 static const struct seq_operations dev_seq_ops = {
4275         .start = dev_seq_start,
4276         .next  = dev_seq_next,
4277         .stop  = dev_seq_stop,
4278         .show  = dev_seq_show,
4279 };
4280
4281 static int dev_seq_open(struct inode *inode, struct file *file)
4282 {
4283         return seq_open_net(inode, file, &dev_seq_ops,
4284                             sizeof(struct seq_net_private));
4285 }
4286
4287 static const struct file_operations dev_seq_fops = {
4288         .owner   = THIS_MODULE,
4289         .open    = dev_seq_open,
4290         .read    = seq_read,
4291         .llseek  = seq_lseek,
4292         .release = seq_release_net,
4293 };
4294
4295 static const struct seq_operations softnet_seq_ops = {
4296         .start = softnet_seq_start,
4297         .next  = softnet_seq_next,
4298         .stop  = softnet_seq_stop,
4299         .show  = softnet_seq_show,
4300 };
4301
4302 static int softnet_seq_open(struct inode *inode, struct file *file)
4303 {
4304         return seq_open(file, &softnet_seq_ops);
4305 }
4306
4307 static const struct file_operations softnet_seq_fops = {
4308         .owner   = THIS_MODULE,
4309         .open    = softnet_seq_open,
4310         .read    = seq_read,
4311         .llseek  = seq_lseek,
4312         .release = seq_release,
4313 };
4314
4315 static void *ptype_get_idx(loff_t pos)
4316 {
4317         struct packet_type *pt = NULL;
4318         loff_t i = 0;
4319         int t;
4320
4321         list_for_each_entry_rcu(pt, &ptype_all, list) {
4322                 if (i == pos)
4323                         return pt;
4324                 ++i;
4325         }
4326
4327         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4328                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4329                         if (i == pos)
4330                                 return pt;
4331                         ++i;
4332                 }
4333         }
4334         return NULL;
4335 }
4336
4337 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4338         __acquires(RCU)
4339 {
4340         rcu_read_lock();
4341         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4342 }
4343
4344 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4345 {
4346         struct packet_type *pt;
4347         struct list_head *nxt;
4348         int hash;
4349
4350         ++*pos;
4351         if (v == SEQ_START_TOKEN)
4352                 return ptype_get_idx(0);
4353
4354         pt = v;
4355         nxt = pt->list.next;
4356         if (pt->type == htons(ETH_P_ALL)) {
4357                 if (nxt != &ptype_all)
4358                         goto found;
4359                 hash = 0;
4360                 nxt = ptype_base[0].next;
4361         } else
4362                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4363
4364         while (nxt == &ptype_base[hash]) {
4365                 if (++hash >= PTYPE_HASH_SIZE)
4366                         return NULL;
4367                 nxt = ptype_base[hash].next;
4368         }
4369 found:
4370         return list_entry(nxt, struct packet_type, list);
4371 }
4372
4373 static void ptype_seq_stop(struct seq_file *seq, void *v)
4374         __releases(RCU)
4375 {
4376         rcu_read_unlock();
4377 }
4378
4379 static int ptype_seq_show(struct seq_file *seq, void *v)
4380 {
4381         struct packet_type *pt = v;
4382
4383         if (v == SEQ_START_TOKEN)
4384                 seq_puts(seq, "Type Device      Function\n");
4385         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4386                 if (pt->type == htons(ETH_P_ALL))
4387                         seq_puts(seq, "ALL ");
4388                 else
4389                         seq_printf(seq, "%04x", ntohs(pt->type));
4390
4391                 seq_printf(seq, " %-8s %pF\n",
4392                            pt->dev ? pt->dev->name : "", pt->func);
4393         }
4394
4395         return 0;
4396 }
4397
4398 static const struct seq_operations ptype_seq_ops = {
4399         .start = ptype_seq_start,
4400         .next  = ptype_seq_next,
4401         .stop  = ptype_seq_stop,
4402         .show  = ptype_seq_show,
4403 };
4404
4405 static int ptype_seq_open(struct inode *inode, struct file *file)
4406 {
4407         return seq_open_net(inode, file, &ptype_seq_ops,
4408                         sizeof(struct seq_net_private));
4409 }
4410
4411 static const struct file_operations ptype_seq_fops = {
4412         .owner   = THIS_MODULE,
4413         .open    = ptype_seq_open,
4414         .read    = seq_read,
4415         .llseek  = seq_lseek,
4416         .release = seq_release_net,
4417 };
4418
4419
4420 static int __net_init dev_proc_net_init(struct net *net)
4421 {
4422         int rc = -ENOMEM;
4423
4424         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4425                 goto out;
4426         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4427                 goto out_dev;
4428         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4429                 goto out_softnet;
4430
4431         if (wext_proc_init(net))
4432                 goto out_ptype;
4433         rc = 0;
4434 out:
4435         return rc;
4436 out_ptype:
4437         proc_net_remove(net, "ptype");
4438 out_softnet:
4439         proc_net_remove(net, "softnet_stat");
4440 out_dev:
4441         proc_net_remove(net, "dev");
4442         goto out;
4443 }
4444
4445 static void __net_exit dev_proc_net_exit(struct net *net)
4446 {
4447         wext_proc_exit(net);
4448
4449         proc_net_remove(net, "ptype");
4450         proc_net_remove(net, "softnet_stat");
4451         proc_net_remove(net, "dev");
4452 }
4453
4454 static struct pernet_operations __net_initdata dev_proc_ops = {
4455         .init = dev_proc_net_init,
4456         .exit = dev_proc_net_exit,
4457 };
4458
4459 static int __init dev_proc_init(void)
4460 {
4461         return register_pernet_subsys(&dev_proc_ops);
4462 }
4463 #else
4464 #define dev_proc_init() 0
4465 #endif  /* CONFIG_PROC_FS */
4466
4467
4468 /**
4469  *      netdev_set_master       -       set up master pointer
4470  *      @slave: slave device
4471  *      @master: new master device
4472  *
4473  *      Changes the master device of the slave. Pass %NULL to break the
4474  *      bonding. The caller must hold the RTNL semaphore. On a failure
4475  *      a negative errno code is returned. On success the reference counts
4476  *      are adjusted and the function returns zero.
4477  */
4478 int netdev_set_master(struct net_device *slave, struct net_device *master)
4479 {
4480         struct net_device *old = slave->master;
4481
4482         ASSERT_RTNL();
4483
4484         if (master) {
4485                 if (old)
4486                         return -EBUSY;
4487                 dev_hold(master);
4488         }
4489
4490         slave->master = master;
4491
4492         if (old)
4493                 dev_put(old);
4494         return 0;
4495 }
4496 EXPORT_SYMBOL(netdev_set_master);
4497
4498 /**
4499  *      netdev_set_bond_master  -       set up bonding master/slave pair
4500  *      @slave: slave device
4501  *      @master: new master device
4502  *
4503  *      Changes the master device of the slave. Pass %NULL to break the
4504  *      bonding. The caller must hold the RTNL semaphore. On a failure
4505  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4506  *      to the routing socket and the function returns zero.
4507  */
4508 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4509 {
4510         int err;
4511
4512         ASSERT_RTNL();
4513
4514         err = netdev_set_master(slave, master);
4515         if (err)
4516                 return err;
4517         if (master)
4518                 slave->flags |= IFF_SLAVE;
4519         else
4520                 slave->flags &= ~IFF_SLAVE;
4521
4522         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4523         return 0;
4524 }
4525 EXPORT_SYMBOL(netdev_set_bond_master);
4526
4527 static void dev_change_rx_flags(struct net_device *dev, int flags)
4528 {
4529         const struct net_device_ops *ops = dev->netdev_ops;
4530
4531         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4532                 ops->ndo_change_rx_flags(dev, flags);
4533 }
4534
4535 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4536 {
4537         unsigned short old_flags = dev->flags;
4538         uid_t uid;
4539         gid_t gid;
4540
4541         ASSERT_RTNL();
4542
4543         dev->flags |= IFF_PROMISC;
4544         dev->promiscuity += inc;
4545         if (dev->promiscuity == 0) {
4546                 /*
4547                  * Avoid overflow.
4548                  * If inc causes overflow, untouch promisc and return error.
4549                  */
4550                 if (inc < 0)
4551                         dev->flags &= ~IFF_PROMISC;
4552                 else {
4553                         dev->promiscuity -= inc;
4554                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4555                                 "set promiscuity failed, promiscuity feature "
4556                                 "of device might be broken.\n", dev->name);
4557                         return -EOVERFLOW;
4558                 }
4559         }
4560         if (dev->flags != old_flags) {
4561                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4562                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4563                                                                "left");
4564                 if (audit_enabled) {
4565                         current_uid_gid(&uid, &gid);
4566                         audit_log(current->audit_context, GFP_ATOMIC,
4567                                 AUDIT_ANOM_PROMISCUOUS,
4568                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4569                                 dev->name, (dev->flags & IFF_PROMISC),
4570                                 (old_flags & IFF_PROMISC),
4571                                 audit_get_loginuid(current),
4572                                 uid, gid,
4573                                 audit_get_sessionid(current));
4574                 }
4575
4576                 dev_change_rx_flags(dev, IFF_PROMISC);
4577         }
4578         return 0;
4579 }
4580
4581 /**
4582  *      dev_set_promiscuity     - update promiscuity count on a device
4583  *      @dev: device
4584  *      @inc: modifier
4585  *
4586  *      Add or remove promiscuity from a device. While the count in the device
4587  *      remains above zero the interface remains promiscuous. Once it hits zero
4588  *      the device reverts back to normal filtering operation. A negative inc
4589  *      value is used to drop promiscuity on the device.
4590  *      Return 0 if successful or a negative errno code on error.
4591  */
4592 int dev_set_promiscuity(struct net_device *dev, int inc)
4593 {
4594         unsigned short old_flags = dev->flags;
4595         int err;
4596
4597         err = __dev_set_promiscuity(dev, inc);
4598         if (err < 0)
4599                 return err;
4600         if (dev->flags != old_flags)
4601                 dev_set_rx_mode(dev);
4602         return err;
4603 }
4604 EXPORT_SYMBOL(dev_set_promiscuity);
4605
4606 /**
4607  *      dev_set_allmulti        - update allmulti count on a device
4608  *      @dev: device
4609  *      @inc: modifier
4610  *
4611  *      Add or remove reception of all multicast frames to a device. While the
4612  *      count in the device remains above zero the interface remains listening
4613  *      to all interfaces. Once it hits zero the device reverts back to normal
4614  *      filtering operation. A negative @inc value is used to drop the counter
4615  *      when releasing a resource needing all multicasts.
4616  *      Return 0 if successful or a negative errno code on error.
4617  */
4618
4619 int dev_set_allmulti(struct net_device *dev, int inc)
4620 {
4621         unsigned short old_flags = dev->flags;
4622
4623         ASSERT_RTNL();
4624
4625         dev->flags |= IFF_ALLMULTI;
4626         dev->allmulti += inc;
4627         if (dev->allmulti == 0) {
4628                 /*
4629                  * Avoid overflow.
4630                  * If inc causes overflow, untouch allmulti and return error.
4631                  */
4632                 if (inc < 0)
4633                         dev->flags &= ~IFF_ALLMULTI;
4634                 else {
4635                         dev->allmulti -= inc;
4636                         printk(KERN_WARNING "%s: allmulti touches roof, "
4637                                 "set allmulti failed, allmulti feature of "
4638                                 "device might be broken.\n", dev->name);
4639                         return -EOVERFLOW;
4640                 }
4641         }
4642         if (dev->flags ^ old_flags) {
4643                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4644                 dev_set_rx_mode(dev);
4645         }
4646         return 0;
4647 }
4648 EXPORT_SYMBOL(dev_set_allmulti);
4649
4650 /*
4651  *      Upload unicast and multicast address lists to device and
4652  *      configure RX filtering. When the device doesn't support unicast
4653  *      filtering it is put in promiscuous mode while unicast addresses
4654  *      are present.
4655  */
4656 void __dev_set_rx_mode(struct net_device *dev)
4657 {
4658         const struct net_device_ops *ops = dev->netdev_ops;
4659
4660         /* dev_open will call this function so the list will stay sane. */
4661         if (!(dev->flags&IFF_UP))
4662                 return;
4663
4664         if (!netif_device_present(dev))
4665                 return;
4666
4667         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4668                 /* Unicast addresses changes may only happen under the rtnl,
4669                  * therefore calling __dev_set_promiscuity here is safe.
4670                  */
4671                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4672                         __dev_set_promiscuity(dev, 1);
4673                         dev->uc_promisc = true;
4674                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4675                         __dev_set_promiscuity(dev, -1);
4676                         dev->uc_promisc = false;
4677                 }
4678         }
4679
4680         if (ops->ndo_set_rx_mode)
4681                 ops->ndo_set_rx_mode(dev);
4682 }
4683
4684 void dev_set_rx_mode(struct net_device *dev)
4685 {
4686         netif_addr_lock_bh(dev);
4687         __dev_set_rx_mode(dev);
4688         netif_addr_unlock_bh(dev);
4689 }
4690
4691 /**
4692  *      dev_get_flags - get flags reported to userspace
4693  *      @dev: device
4694  *
4695  *      Get the combination of flag bits exported through APIs to userspace.
4696  */
4697 unsigned dev_get_flags(const struct net_device *dev)
4698 {
4699         unsigned flags;
4700
4701         flags = (dev->flags & ~(IFF_PROMISC |
4702                                 IFF_ALLMULTI |
4703                                 IFF_RUNNING |
4704                                 IFF_LOWER_UP |
4705                                 IFF_DORMANT)) |
4706                 (dev->gflags & (IFF_PROMISC |
4707                                 IFF_ALLMULTI));
4708
4709         if (netif_running(dev)) {
4710                 if (netif_oper_up(dev))
4711                         flags |= IFF_RUNNING;
4712                 if (netif_carrier_ok(dev))
4713                         flags |= IFF_LOWER_UP;
4714                 if (netif_dormant(dev))
4715                         flags |= IFF_DORMANT;
4716         }
4717
4718         return flags;
4719 }
4720 EXPORT_SYMBOL(dev_get_flags);
4721
4722 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4723 {
4724         int old_flags = dev->flags;
4725         int ret;
4726
4727         ASSERT_RTNL();
4728
4729         /*
4730          *      Set the flags on our device.
4731          */
4732
4733         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4734                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4735                                IFF_AUTOMEDIA)) |
4736                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4737                                     IFF_ALLMULTI));
4738
4739         /*
4740          *      Load in the correct multicast list now the flags have changed.
4741          */
4742
4743         if ((old_flags ^ flags) & IFF_MULTICAST)
4744                 dev_change_rx_flags(dev, IFF_MULTICAST);
4745
4746         dev_set_rx_mode(dev);
4747
4748         /*
4749          *      Have we downed the interface. We handle IFF_UP ourselves
4750          *      according to user attempts to set it, rather than blindly
4751          *      setting it.
4752          */
4753
4754         ret = 0;
4755         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4756                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4757
4758                 if (!ret)
4759                         dev_set_rx_mode(dev);
4760         }
4761
4762         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4763                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4764
4765                 dev->gflags ^= IFF_PROMISC;
4766                 dev_set_promiscuity(dev, inc);
4767         }
4768
4769         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4770            is important. Some (broken) drivers set IFF_PROMISC, when
4771            IFF_ALLMULTI is requested not asking us and not reporting.
4772          */
4773         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4774                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4775
4776                 dev->gflags ^= IFF_ALLMULTI;
4777                 dev_set_allmulti(dev, inc);
4778         }
4779
4780         return ret;
4781 }
4782
4783 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4784 {
4785         unsigned int changes = dev->flags ^ old_flags;
4786
4787         if (changes & IFF_UP) {
4788                 if (dev->flags & IFF_UP)
4789                         call_netdevice_notifiers(NETDEV_UP, dev);
4790                 else
4791                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4792         }
4793
4794         if (dev->flags & IFF_UP &&
4795             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4796                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4797 }
4798
4799 /**
4800  *      dev_change_flags - change device settings
4801  *      @dev: device
4802  *      @flags: device state flags
4803  *
4804  *      Change settings on device based state flags. The flags are
4805  *      in the userspace exported format.
4806  */
4807 int dev_change_flags(struct net_device *dev, unsigned flags)
4808 {
4809         int ret, changes;
4810         int old_flags = dev->flags;
4811
4812         ret = __dev_change_flags(dev, flags);
4813         if (ret < 0)
4814                 return ret;
4815
4816         changes = old_flags ^ dev->flags;
4817         if (changes)
4818                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4819
4820         __dev_notify_flags(dev, old_flags);
4821         return ret;
4822 }
4823 EXPORT_SYMBOL(dev_change_flags);
4824
4825 /**
4826  *      dev_set_mtu - Change maximum transfer unit
4827  *      @dev: device
4828  *      @new_mtu: new transfer unit
4829  *
4830  *      Change the maximum transfer size of the network device.
4831  */
4832 int dev_set_mtu(struct net_device *dev, int new_mtu)
4833 {
4834         const struct net_device_ops *ops = dev->netdev_ops;
4835         int err;
4836
4837         if (new_mtu == dev->mtu)
4838                 return 0;
4839
4840         /*      MTU must be positive.    */
4841         if (new_mtu < 0)
4842                 return -EINVAL;
4843
4844         if (!netif_device_present(dev))
4845                 return -ENODEV;
4846
4847         err = 0;
4848         if (ops->ndo_change_mtu)
4849                 err = ops->ndo_change_mtu(dev, new_mtu);
4850         else
4851                 dev->mtu = new_mtu;
4852
4853         if (!err && dev->flags & IFF_UP)
4854                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4855         return err;
4856 }
4857 EXPORT_SYMBOL(dev_set_mtu);
4858
4859 /**
4860  *      dev_set_group - Change group this device belongs to
4861  *      @dev: device
4862  *      @new_group: group this device should belong to
4863  */
4864 void dev_set_group(struct net_device *dev, int new_group)
4865 {
4866         dev->group = new_group;
4867 }
4868 EXPORT_SYMBOL(dev_set_group);
4869
4870 /**
4871  *      dev_set_mac_address - Change Media Access Control Address
4872  *      @dev: device
4873  *      @sa: new address
4874  *
4875  *      Change the hardware (MAC) address of the device
4876  */
4877 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4878 {
4879         const struct net_device_ops *ops = dev->netdev_ops;
4880         int err;
4881
4882         if (!ops->ndo_set_mac_address)
4883                 return -EOPNOTSUPP;
4884         if (sa->sa_family != dev->type)
4885                 return -EINVAL;
4886         if (!netif_device_present(dev))
4887                 return -ENODEV;
4888         err = ops->ndo_set_mac_address(dev, sa);
4889         if (!err)
4890                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4891         return err;
4892 }
4893 EXPORT_SYMBOL(dev_set_mac_address);
4894
4895 /*
4896  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4897  */
4898 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4899 {
4900         int err;
4901         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4902
4903         if (!dev)
4904                 return -ENODEV;
4905
4906         switch (cmd) {
4907         case SIOCGIFFLAGS:      /* Get interface flags */
4908                 ifr->ifr_flags = (short) dev_get_flags(dev);
4909                 return 0;
4910
4911         case SIOCGIFMETRIC:     /* Get the metric on the interface
4912                                    (currently unused) */
4913                 ifr->ifr_metric = 0;
4914                 return 0;
4915
4916         case SIOCGIFMTU:        /* Get the MTU of a device */
4917                 ifr->ifr_mtu = dev->mtu;
4918                 return 0;
4919
4920         case SIOCGIFHWADDR:
4921                 if (!dev->addr_len)
4922                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4923                 else
4924                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4925                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4926                 ifr->ifr_hwaddr.sa_family = dev->type;
4927                 return 0;
4928
4929         case SIOCGIFSLAVE:
4930                 err = -EINVAL;
4931                 break;
4932
4933         case SIOCGIFMAP:
4934                 ifr->ifr_map.mem_start = dev->mem_start;
4935                 ifr->ifr_map.mem_end   = dev->mem_end;
4936                 ifr->ifr_map.base_addr = dev->base_addr;
4937                 ifr->ifr_map.irq       = dev->irq;
4938                 ifr->ifr_map.dma       = dev->dma;
4939                 ifr->ifr_map.port      = dev->if_port;
4940                 return 0;
4941
4942         case SIOCGIFINDEX:
4943                 ifr->ifr_ifindex = dev->ifindex;
4944                 return 0;
4945
4946         case SIOCGIFTXQLEN:
4947                 ifr->ifr_qlen = dev->tx_queue_len;
4948                 return 0;
4949
4950         default:
4951                 /* dev_ioctl() should ensure this case
4952                  * is never reached
4953                  */
4954                 WARN_ON(1);
4955                 err = -ENOTTY;
4956                 break;
4957
4958         }
4959         return err;
4960 }
4961
4962 /*
4963  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4964  */
4965 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4966 {
4967         int err;
4968         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4969         const struct net_device_ops *ops;
4970
4971         if (!dev)
4972                 return -ENODEV;
4973
4974         ops = dev->netdev_ops;
4975
4976         switch (cmd) {
4977         case SIOCSIFFLAGS:      /* Set interface flags */
4978                 return dev_change_flags(dev, ifr->ifr_flags);
4979
4980         case SIOCSIFMETRIC:     /* Set the metric on the interface
4981                                    (currently unused) */
4982                 return -EOPNOTSUPP;
4983
4984         case SIOCSIFMTU:        /* Set the MTU of a device */
4985                 return dev_set_mtu(dev, ifr->ifr_mtu);
4986
4987         case SIOCSIFHWADDR:
4988                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4989
4990         case SIOCSIFHWBROADCAST:
4991                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4992                         return -EINVAL;
4993                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4994                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4995                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4996                 return 0;
4997
4998         case SIOCSIFMAP:
4999                 if (ops->ndo_set_config) {
5000                         if (!netif_device_present(dev))
5001                                 return -ENODEV;
5002                         return ops->ndo_set_config(dev, &ifr->ifr_map);
5003                 }
5004                 return -EOPNOTSUPP;
5005
5006         case SIOCADDMULTI:
5007                 if (!ops->ndo_set_rx_mode ||
5008                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5009                         return -EINVAL;
5010                 if (!netif_device_present(dev))
5011                         return -ENODEV;
5012                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5013
5014         case SIOCDELMULTI:
5015                 if (!ops->ndo_set_rx_mode ||
5016                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5017                         return -EINVAL;
5018                 if (!netif_device_present(dev))
5019                         return -ENODEV;
5020                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5021
5022         case SIOCSIFTXQLEN:
5023                 if (ifr->ifr_qlen < 0)
5024                         return -EINVAL;
5025                 dev->tx_queue_len = ifr->ifr_qlen;
5026                 return 0;
5027
5028         case SIOCSIFNAME:
5029                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5030                 return dev_change_name(dev, ifr->ifr_newname);
5031
5032         case SIOCSHWTSTAMP:
5033                 err = net_hwtstamp_validate(ifr);
5034                 if (err)
5035                         return err;
5036                 /* fall through */
5037
5038         /*
5039          *      Unknown or private ioctl
5040          */
5041         default:
5042                 if ((cmd >= SIOCDEVPRIVATE &&
5043                     cmd <= SIOCDEVPRIVATE + 15) ||
5044                     cmd == SIOCBONDENSLAVE ||
5045                     cmd == SIOCBONDRELEASE ||
5046                     cmd == SIOCBONDSETHWADDR ||
5047                     cmd == SIOCBONDSLAVEINFOQUERY ||
5048                     cmd == SIOCBONDINFOQUERY ||
5049                     cmd == SIOCBONDCHANGEACTIVE ||
5050                     cmd == SIOCGMIIPHY ||
5051                     cmd == SIOCGMIIREG ||
5052                     cmd == SIOCSMIIREG ||
5053                     cmd == SIOCBRADDIF ||
5054                     cmd == SIOCBRDELIF ||
5055                     cmd == SIOCSHWTSTAMP ||
5056                     cmd == SIOCWANDEV) {
5057                         err = -EOPNOTSUPP;
5058                         if (ops->ndo_do_ioctl) {
5059                                 if (netif_device_present(dev))
5060                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
5061                                 else
5062                                         err = -ENODEV;
5063                         }
5064                 } else
5065                         err = -EINVAL;
5066
5067         }
5068         return err;
5069 }
5070
5071 /*
5072  *      This function handles all "interface"-type I/O control requests. The actual
5073  *      'doing' part of this is dev_ifsioc above.
5074  */
5075
5076 /**
5077  *      dev_ioctl       -       network device ioctl
5078  *      @net: the applicable net namespace
5079  *      @cmd: command to issue
5080  *      @arg: pointer to a struct ifreq in user space
5081  *
5082  *      Issue ioctl functions to devices. This is normally called by the
5083  *      user space syscall interfaces but can sometimes be useful for
5084  *      other purposes. The return value is the return from the syscall if
5085  *      positive or a negative errno code on error.
5086  */
5087
5088 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5089 {
5090         struct ifreq ifr;
5091         int ret;
5092         char *colon;
5093
5094         /* One special case: SIOCGIFCONF takes ifconf argument
5095            and requires shared lock, because it sleeps writing
5096            to user space.
5097          */
5098
5099         if (cmd == SIOCGIFCONF) {
5100                 rtnl_lock();
5101                 ret = dev_ifconf(net, (char __user *) arg);
5102                 rtnl_unlock();
5103                 return ret;
5104         }
5105         if (cmd == SIOCGIFNAME)
5106                 return dev_ifname(net, (struct ifreq __user *)arg);
5107
5108         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5109                 return -EFAULT;
5110
5111         ifr.ifr_name[IFNAMSIZ-1] = 0;
5112
5113         colon = strchr(ifr.ifr_name, ':');
5114         if (colon)
5115                 *colon = 0;
5116
5117         /*
5118          *      See which interface the caller is talking about.
5119          */
5120
5121         switch (cmd) {
5122         /*
5123          *      These ioctl calls:
5124          *      - can be done by all.
5125          *      - atomic and do not require locking.
5126          *      - return a value
5127          */
5128         case SIOCGIFFLAGS:
5129         case SIOCGIFMETRIC:
5130         case SIOCGIFMTU:
5131         case SIOCGIFHWADDR:
5132         case SIOCGIFSLAVE:
5133         case SIOCGIFMAP:
5134         case SIOCGIFINDEX:
5135         case SIOCGIFTXQLEN:
5136                 dev_load(net, ifr.ifr_name);
5137                 rcu_read_lock();
5138                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5139                 rcu_read_unlock();
5140                 if (!ret) {
5141                         if (colon)
5142                                 *colon = ':';
5143                         if (copy_to_user(arg, &ifr,
5144                                          sizeof(struct ifreq)))
5145                                 ret = -EFAULT;
5146                 }
5147                 return ret;
5148
5149         case SIOCETHTOOL:
5150                 dev_load(net, ifr.ifr_name);
5151                 rtnl_lock();
5152                 ret = dev_ethtool(net, &ifr);
5153                 rtnl_unlock();
5154                 if (!ret) {
5155                         if (colon)
5156                                 *colon = ':';
5157                         if (copy_to_user(arg, &ifr,
5158                                          sizeof(struct ifreq)))
5159                                 ret = -EFAULT;
5160                 }
5161                 return ret;
5162
5163         /*
5164          *      These ioctl calls:
5165          *      - require superuser power.
5166          *      - require strict serialization.
5167          *      - return a value
5168          */
5169         case SIOCGMIIPHY:
5170         case SIOCGMIIREG:
5171         case SIOCSIFNAME:
5172                 if (!capable(CAP_NET_ADMIN))
5173                         return -EPERM;
5174                 dev_load(net, ifr.ifr_name);
5175                 rtnl_lock();
5176                 ret = dev_ifsioc(net, &ifr, cmd);
5177                 rtnl_unlock();
5178                 if (!ret) {
5179                         if (colon)
5180                                 *colon = ':';
5181                         if (copy_to_user(arg, &ifr,
5182                                          sizeof(struct ifreq)))
5183                                 ret = -EFAULT;
5184                 }
5185                 return ret;
5186
5187         /*
5188          *      These ioctl calls:
5189          *      - require superuser power.
5190          *      - require strict serialization.
5191          *      - do not return a value
5192          */
5193         case SIOCSIFFLAGS:
5194         case SIOCSIFMETRIC:
5195         case SIOCSIFMTU:
5196         case SIOCSIFMAP:
5197         case SIOCSIFHWADDR:
5198         case SIOCSIFSLAVE:
5199         case SIOCADDMULTI:
5200         case SIOCDELMULTI:
5201         case SIOCSIFHWBROADCAST:
5202         case SIOCSIFTXQLEN:
5203         case SIOCSMIIREG:
5204         case SIOCBONDENSLAVE:
5205         case SIOCBONDRELEASE:
5206         case SIOCBONDSETHWADDR:
5207         case SIOCBONDCHANGEACTIVE:
5208         case SIOCBRADDIF:
5209         case SIOCBRDELIF:
5210         case SIOCSHWTSTAMP:
5211                 if (!capable(CAP_NET_ADMIN))
5212                         return -EPERM;
5213                 /* fall through */
5214         case SIOCBONDSLAVEINFOQUERY:
5215         case SIOCBONDINFOQUERY:
5216                 dev_load(net, ifr.ifr_name);
5217                 rtnl_lock();
5218                 ret = dev_ifsioc(net, &ifr, cmd);
5219                 rtnl_unlock();
5220                 return ret;
5221
5222         case SIOCGIFMEM:
5223                 /* Get the per device memory space. We can add this but
5224                  * currently do not support it */
5225         case SIOCSIFMEM:
5226                 /* Set the per device memory buffer space.
5227                  * Not applicable in our case */
5228         case SIOCSIFLINK:
5229                 return -ENOTTY;
5230
5231         /*
5232          *      Unknown or private ioctl.
5233          */
5234         default:
5235                 if (cmd == SIOCWANDEV ||
5236                     (cmd >= SIOCDEVPRIVATE &&
5237                      cmd <= SIOCDEVPRIVATE + 15)) {
5238                         dev_load(net, ifr.ifr_name);
5239                         rtnl_lock();
5240                         ret = dev_ifsioc(net, &ifr, cmd);
5241                         rtnl_unlock();
5242                         if (!ret && copy_to_user(arg, &ifr,
5243                                                  sizeof(struct ifreq)))
5244                                 ret = -EFAULT;
5245                         return ret;
5246                 }
5247                 /* Take care of Wireless Extensions */
5248                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5249                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5250                 return -ENOTTY;
5251         }
5252 }
5253
5254
5255 /**
5256  *      dev_new_index   -       allocate an ifindex
5257  *      @net: the applicable net namespace
5258  *
5259  *      Returns a suitable unique value for a new device interface
5260  *      number.  The caller must hold the rtnl semaphore or the
5261  *      dev_base_lock to be sure it remains unique.
5262  */
5263 static int dev_new_index(struct net *net)
5264 {
5265         static int ifindex;
5266         for (;;) {
5267                 if (++ifindex <= 0)
5268                         ifindex = 1;
5269                 if (!__dev_get_by_index(net, ifindex))
5270                         return ifindex;
5271         }
5272 }
5273
5274 /* Delayed registration/unregisteration */
5275 static LIST_HEAD(net_todo_list);
5276
5277 static void net_set_todo(struct net_device *dev)
5278 {
5279         list_add_tail(&dev->todo_list, &net_todo_list);
5280 }
5281
5282 static void rollback_registered_many(struct list_head *head)
5283 {
5284         struct net_device *dev, *tmp;
5285
5286         BUG_ON(dev_boot_phase);
5287         ASSERT_RTNL();
5288
5289         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5290                 /* Some devices call without registering
5291                  * for initialization unwind. Remove those
5292                  * devices and proceed with the remaining.
5293                  */
5294                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5295                         pr_debug("unregister_netdevice: device %s/%p never "
5296                                  "was registered\n", dev->name, dev);
5297
5298                         WARN_ON(1);
5299                         list_del(&dev->unreg_list);
5300                         continue;
5301                 }
5302                 dev->dismantle = true;
5303                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5304         }
5305
5306         /* If device is running, close it first. */
5307         dev_close_many(head);
5308
5309         list_for_each_entry(dev, head, unreg_list) {
5310                 /* And unlink it from device chain. */
5311                 unlist_netdevice(dev);
5312
5313                 dev->reg_state = NETREG_UNREGISTERING;
5314         }
5315
5316         synchronize_net();
5317
5318         list_for_each_entry(dev, head, unreg_list) {
5319                 /* Shutdown queueing discipline. */
5320                 dev_shutdown(dev);
5321
5322
5323                 /* Notify protocols, that we are about to destroy
5324                    this device. They should clean all the things.
5325                 */
5326                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5327
5328                 if (!dev->rtnl_link_ops ||
5329                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5330                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5331
5332                 /*
5333                  *      Flush the unicast and multicast chains
5334                  */
5335                 dev_uc_flush(dev);
5336                 dev_mc_flush(dev);
5337
5338                 if (dev->netdev_ops->ndo_uninit)
5339                         dev->netdev_ops->ndo_uninit(dev);
5340
5341                 /* Notifier chain MUST detach us from master device. */
5342                 WARN_ON(dev->master);
5343
5344                 /* Remove entries from kobject tree */
5345                 netdev_unregister_kobject(dev);
5346         }
5347
5348         /* Process any work delayed until the end of the batch */
5349         dev = list_first_entry(head, struct net_device, unreg_list);
5350         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5351
5352         synchronize_net();
5353
5354         list_for_each_entry(dev, head, unreg_list)
5355                 dev_put(dev);
5356 }
5357
5358 static void rollback_registered(struct net_device *dev)
5359 {
5360         LIST_HEAD(single);
5361
5362         list_add(&dev->unreg_list, &single);
5363         rollback_registered_many(&single);
5364         list_del(&single);
5365 }
5366
5367 static u32 netdev_fix_features(struct net_device *dev, u32 features)
5368 {
5369         /* Fix illegal checksum combinations */
5370         if ((features & NETIF_F_HW_CSUM) &&
5371             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5372                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5373                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5374         }
5375
5376         if ((features & NETIF_F_NO_CSUM) &&
5377             (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5378                 netdev_warn(dev, "mixed no checksumming and other settings.\n");
5379                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5380         }
5381
5382         /* Fix illegal SG+CSUM combinations. */
5383         if ((features & NETIF_F_SG) &&
5384             !(features & NETIF_F_ALL_CSUM)) {
5385                 netdev_dbg(dev,
5386                         "Dropping NETIF_F_SG since no checksum feature.\n");
5387                 features &= ~NETIF_F_SG;
5388         }
5389
5390         /* TSO requires that SG is present as well. */
5391         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5392                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5393                 features &= ~NETIF_F_ALL_TSO;
5394         }
5395
5396         /* TSO ECN requires that TSO is present as well. */
5397         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5398                 features &= ~NETIF_F_TSO_ECN;
5399
5400         /* Software GSO depends on SG. */
5401         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5402                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5403                 features &= ~NETIF_F_GSO;
5404         }
5405
5406         /* UFO needs SG and checksumming */
5407         if (features & NETIF_F_UFO) {
5408                 /* maybe split UFO into V4 and V6? */
5409                 if (!((features & NETIF_F_GEN_CSUM) ||
5410                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5411                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5412                         netdev_dbg(dev,
5413                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5414                         features &= ~NETIF_F_UFO;
5415                 }
5416
5417                 if (!(features & NETIF_F_SG)) {
5418                         netdev_dbg(dev,
5419                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5420                         features &= ~NETIF_F_UFO;
5421                 }
5422         }
5423
5424         return features;
5425 }
5426
5427 int __netdev_update_features(struct net_device *dev)
5428 {
5429         u32 features;
5430         int err = 0;
5431
5432         ASSERT_RTNL();
5433
5434         features = netdev_get_wanted_features(dev);
5435
5436         if (dev->netdev_ops->ndo_fix_features)
5437                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5438
5439         /* driver might be less strict about feature dependencies */
5440         features = netdev_fix_features(dev, features);
5441
5442         if (dev->features == features)
5443                 return 0;
5444
5445         netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5446                 dev->features, features);
5447
5448         if (dev->netdev_ops->ndo_set_features)
5449                 err = dev->netdev_ops->ndo_set_features(dev, features);
5450
5451         if (unlikely(err < 0)) {
5452                 netdev_err(dev,
5453                         "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5454                         err, features, dev->features);
5455                 return -1;
5456         }
5457
5458         if (!err)
5459                 dev->features = features;
5460
5461         return 1;
5462 }
5463
5464 /**
5465  *      netdev_update_features - recalculate device features
5466  *      @dev: the device to check
5467  *
5468  *      Recalculate dev->features set and send notifications if it
5469  *      has changed. Should be called after driver or hardware dependent
5470  *      conditions might have changed that influence the features.
5471  */
5472 void netdev_update_features(struct net_device *dev)
5473 {
5474         if (__netdev_update_features(dev))
5475                 netdev_features_change(dev);
5476 }
5477 EXPORT_SYMBOL(netdev_update_features);
5478
5479 /**
5480  *      netdev_change_features - recalculate device features
5481  *      @dev: the device to check
5482  *
5483  *      Recalculate dev->features set and send notifications even
5484  *      if they have not changed. Should be called instead of
5485  *      netdev_update_features() if also dev->vlan_features might
5486  *      have changed to allow the changes to be propagated to stacked
5487  *      VLAN devices.
5488  */
5489 void netdev_change_features(struct net_device *dev)
5490 {
5491         __netdev_update_features(dev);
5492         netdev_features_change(dev);
5493 }
5494 EXPORT_SYMBOL(netdev_change_features);
5495
5496 /**
5497  *      netif_stacked_transfer_operstate -      transfer operstate
5498  *      @rootdev: the root or lower level device to transfer state from
5499  *      @dev: the device to transfer operstate to
5500  *
5501  *      Transfer operational state from root to device. This is normally
5502  *      called when a stacking relationship exists between the root
5503  *      device and the device(a leaf device).
5504  */
5505 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5506                                         struct net_device *dev)
5507 {
5508         if (rootdev->operstate == IF_OPER_DORMANT)
5509                 netif_dormant_on(dev);
5510         else
5511                 netif_dormant_off(dev);
5512
5513         if (netif_carrier_ok(rootdev)) {
5514                 if (!netif_carrier_ok(dev))
5515                         netif_carrier_on(dev);
5516         } else {
5517                 if (netif_carrier_ok(dev))
5518                         netif_carrier_off(dev);
5519         }
5520 }
5521 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5522
5523 #ifdef CONFIG_RPS
5524 static int netif_alloc_rx_queues(struct net_device *dev)
5525 {
5526         unsigned int i, count = dev->num_rx_queues;
5527         struct netdev_rx_queue *rx;
5528
5529         BUG_ON(count < 1);
5530
5531         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5532         if (!rx) {
5533                 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5534                 return -ENOMEM;
5535         }
5536         dev->_rx = rx;
5537
5538         for (i = 0; i < count; i++)
5539                 rx[i].dev = dev;
5540         return 0;
5541 }
5542 #endif
5543
5544 static void netdev_init_one_queue(struct net_device *dev,
5545                                   struct netdev_queue *queue, void *_unused)
5546 {
5547         /* Initialize queue lock */
5548         spin_lock_init(&queue->_xmit_lock);
5549         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5550         queue->xmit_lock_owner = -1;
5551         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5552         queue->dev = dev;
5553 }
5554
5555 static int netif_alloc_netdev_queues(struct net_device *dev)
5556 {
5557         unsigned int count = dev->num_tx_queues;
5558         struct netdev_queue *tx;
5559
5560         BUG_ON(count < 1);
5561
5562         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5563         if (!tx) {
5564                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5565                        count);
5566                 return -ENOMEM;
5567         }
5568         dev->_tx = tx;
5569
5570         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5571         spin_lock_init(&dev->tx_global_lock);
5572
5573         return 0;
5574 }
5575
5576 /**
5577  *      register_netdevice      - register a network device
5578  *      @dev: device to register
5579  *
5580  *      Take a completed network device structure and add it to the kernel
5581  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5582  *      chain. 0 is returned on success. A negative errno code is returned
5583  *      on a failure to set up the device, or if the name is a duplicate.
5584  *
5585  *      Callers must hold the rtnl semaphore. You may want
5586  *      register_netdev() instead of this.
5587  *
5588  *      BUGS:
5589  *      The locking appears insufficient to guarantee two parallel registers
5590  *      will not get the same name.
5591  */
5592
5593 int register_netdevice(struct net_device *dev)
5594 {
5595         int ret;
5596         struct net *net = dev_net(dev);
5597
5598         BUG_ON(dev_boot_phase);
5599         ASSERT_RTNL();
5600
5601         might_sleep();
5602
5603         /* When net_device's are persistent, this will be fatal. */
5604         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5605         BUG_ON(!net);
5606
5607         spin_lock_init(&dev->addr_list_lock);
5608         netdev_set_addr_lockdep_class(dev);
5609
5610         dev->iflink = -1;
5611
5612         ret = dev_get_valid_name(dev, dev->name);
5613         if (ret < 0)
5614                 goto out;
5615
5616         /* Init, if this function is available */
5617         if (dev->netdev_ops->ndo_init) {
5618                 ret = dev->netdev_ops->ndo_init(dev);
5619                 if (ret) {
5620                         if (ret > 0)
5621                                 ret = -EIO;
5622                         goto out;
5623                 }
5624         }
5625
5626         dev->ifindex = dev_new_index(net);
5627         if (dev->iflink == -1)
5628                 dev->iflink = dev->ifindex;
5629
5630         /* Transfer changeable features to wanted_features and enable
5631          * software offloads (GSO and GRO).
5632          */
5633         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5634         dev->features |= NETIF_F_SOFT_FEATURES;
5635         dev->wanted_features = dev->features & dev->hw_features;
5636
5637         /* Turn on no cache copy if HW is doing checksum */
5638         dev->hw_features |= NETIF_F_NOCACHE_COPY;
5639         if ((dev->features & NETIF_F_ALL_CSUM) &&
5640             !(dev->features & NETIF_F_NO_CSUM)) {
5641                 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5642                 dev->features |= NETIF_F_NOCACHE_COPY;
5643         }
5644
5645         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5646          */
5647         dev->vlan_features |= NETIF_F_HIGHDMA;
5648
5649         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5650         ret = notifier_to_errno(ret);
5651         if (ret)
5652                 goto err_uninit;
5653
5654         ret = netdev_register_kobject(dev);
5655         if (ret)
5656                 goto err_uninit;
5657         dev->reg_state = NETREG_REGISTERED;
5658
5659         __netdev_update_features(dev);
5660
5661         /*
5662          *      Default initial state at registry is that the
5663          *      device is present.
5664          */
5665
5666         set_bit(__LINK_STATE_PRESENT, &dev->state);
5667
5668         dev_init_scheduler(dev);
5669         dev_hold(dev);
5670         list_netdevice(dev);
5671
5672         /* Notify protocols, that a new device appeared. */
5673         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5674         ret = notifier_to_errno(ret);
5675         if (ret) {
5676                 rollback_registered(dev);
5677                 dev->reg_state = NETREG_UNREGISTERED;
5678         }
5679         /*
5680          *      Prevent userspace races by waiting until the network
5681          *      device is fully setup before sending notifications.
5682          */
5683         if (!dev->rtnl_link_ops ||
5684             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5685                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5686
5687 out:
5688         return ret;
5689
5690 err_uninit:
5691         if (dev->netdev_ops->ndo_uninit)
5692                 dev->netdev_ops->ndo_uninit(dev);
5693         goto out;
5694 }
5695 EXPORT_SYMBOL(register_netdevice);
5696
5697 /**
5698  *      init_dummy_netdev       - init a dummy network device for NAPI
5699  *      @dev: device to init
5700  *
5701  *      This takes a network device structure and initialize the minimum
5702  *      amount of fields so it can be used to schedule NAPI polls without
5703  *      registering a full blown interface. This is to be used by drivers
5704  *      that need to tie several hardware interfaces to a single NAPI
5705  *      poll scheduler due to HW limitations.
5706  */
5707 int init_dummy_netdev(struct net_device *dev)
5708 {
5709         /* Clear everything. Note we don't initialize spinlocks
5710          * are they aren't supposed to be taken by any of the
5711          * NAPI code and this dummy netdev is supposed to be
5712          * only ever used for NAPI polls
5713          */
5714         memset(dev, 0, sizeof(struct net_device));
5715
5716         /* make sure we BUG if trying to hit standard
5717          * register/unregister code path
5718          */
5719         dev->reg_state = NETREG_DUMMY;
5720
5721         /* NAPI wants this */
5722         INIT_LIST_HEAD(&dev->napi_list);
5723
5724         /* a dummy interface is started by default */
5725         set_bit(__LINK_STATE_PRESENT, &dev->state);
5726         set_bit(__LINK_STATE_START, &dev->state);
5727
5728         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5729          * because users of this 'device' dont need to change
5730          * its refcount.
5731          */
5732
5733         return 0;
5734 }
5735 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5736
5737
5738 /**
5739  *      register_netdev - register a network device
5740  *      @dev: device to register
5741  *
5742  *      Take a completed network device structure and add it to the kernel
5743  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5744  *      chain. 0 is returned on success. A negative errno code is returned
5745  *      on a failure to set up the device, or if the name is a duplicate.
5746  *
5747  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5748  *      and expands the device name if you passed a format string to
5749  *      alloc_netdev.
5750  */
5751 int register_netdev(struct net_device *dev)
5752 {
5753         int err;
5754
5755         rtnl_lock();
5756         err = register_netdevice(dev);
5757         rtnl_unlock();
5758         return err;
5759 }
5760 EXPORT_SYMBOL(register_netdev);
5761
5762 int netdev_refcnt_read(const struct net_device *dev)
5763 {
5764         int i, refcnt = 0;
5765
5766         for_each_possible_cpu(i)
5767                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5768         return refcnt;
5769 }
5770 EXPORT_SYMBOL(netdev_refcnt_read);
5771
5772 /*
5773  * netdev_wait_allrefs - wait until all references are gone.
5774  *
5775  * This is called when unregistering network devices.
5776  *
5777  * Any protocol or device that holds a reference should register
5778  * for netdevice notification, and cleanup and put back the
5779  * reference if they receive an UNREGISTER event.
5780  * We can get stuck here if buggy protocols don't correctly
5781  * call dev_put.
5782  */
5783 static void netdev_wait_allrefs(struct net_device *dev)
5784 {
5785         unsigned long rebroadcast_time, warning_time;
5786         int refcnt;
5787
5788         linkwatch_forget_dev(dev);
5789
5790         rebroadcast_time = warning_time = jiffies;
5791         refcnt = netdev_refcnt_read(dev);
5792
5793         while (refcnt != 0) {
5794                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5795                         rtnl_lock();
5796
5797                         /* Rebroadcast unregister notification */
5798                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5799                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5800                          * should have already handle it the first time */
5801
5802                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5803                                      &dev->state)) {
5804                                 /* We must not have linkwatch events
5805                                  * pending on unregister. If this
5806                                  * happens, we simply run the queue
5807                                  * unscheduled, resulting in a noop
5808                                  * for this device.
5809                                  */
5810                                 linkwatch_run_queue();
5811                         }
5812
5813                         __rtnl_unlock();
5814
5815                         rebroadcast_time = jiffies;
5816                 }
5817
5818                 msleep(250);
5819
5820                 refcnt = netdev_refcnt_read(dev);
5821
5822                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5823                         printk(KERN_EMERG "unregister_netdevice: "
5824                                "waiting for %s to become free. Usage "
5825                                "count = %d\n",
5826                                dev->name, refcnt);
5827                         warning_time = jiffies;
5828                 }
5829         }
5830 }
5831
5832 /* The sequence is:
5833  *
5834  *      rtnl_lock();
5835  *      ...
5836  *      register_netdevice(x1);
5837  *      register_netdevice(x2);
5838  *      ...
5839  *      unregister_netdevice(y1);
5840  *      unregister_netdevice(y2);
5841  *      ...
5842  *      rtnl_unlock();
5843  *      free_netdev(y1);
5844  *      free_netdev(y2);
5845  *
5846  * We are invoked by rtnl_unlock().
5847  * This allows us to deal with problems:
5848  * 1) We can delete sysfs objects which invoke hotplug
5849  *    without deadlocking with linkwatch via keventd.
5850  * 2) Since we run with the RTNL semaphore not held, we can sleep
5851  *    safely in order to wait for the netdev refcnt to drop to zero.
5852  *
5853  * We must not return until all unregister events added during
5854  * the interval the lock was held have been completed.
5855  */
5856 void netdev_run_todo(void)
5857 {
5858         struct list_head list;
5859
5860         /* Snapshot list, allow later requests */
5861         list_replace_init(&net_todo_list, &list);
5862
5863         __rtnl_unlock();
5864
5865         /* Wait for rcu callbacks to finish before attempting to drain
5866          * the device list.  This usually avoids a 250ms wait.
5867          */
5868         if (!list_empty(&list))
5869                 rcu_barrier();
5870
5871         while (!list_empty(&list)) {
5872                 struct net_device *dev
5873                         = list_first_entry(&list, struct net_device, todo_list);
5874                 list_del(&dev->todo_list);
5875
5876                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5877                         printk(KERN_ERR "network todo '%s' but state %d\n",
5878                                dev->name, dev->reg_state);
5879                         dump_stack();
5880                         continue;
5881                 }
5882
5883                 dev->reg_state = NETREG_UNREGISTERED;
5884
5885                 on_each_cpu(flush_backlog, dev, 1);
5886
5887                 netdev_wait_allrefs(dev);
5888
5889                 /* paranoia */
5890                 BUG_ON(netdev_refcnt_read(dev));
5891                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5892                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5893                 WARN_ON(dev->dn_ptr);
5894
5895                 if (dev->destructor)
5896                         dev->destructor(dev);
5897
5898                 /* Free network device */
5899                 kobject_put(&dev->dev.kobj);
5900         }
5901 }
5902
5903 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5904  * fields in the same order, with only the type differing.
5905  */
5906 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5907                                     const struct net_device_stats *netdev_stats)
5908 {
5909 #if BITS_PER_LONG == 64
5910         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5911         memcpy(stats64, netdev_stats, sizeof(*stats64));
5912 #else
5913         size_t i, n = sizeof(*stats64) / sizeof(u64);
5914         const unsigned long *src = (const unsigned long *)netdev_stats;
5915         u64 *dst = (u64 *)stats64;
5916
5917         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5918                      sizeof(*stats64) / sizeof(u64));
5919         for (i = 0; i < n; i++)
5920                 dst[i] = src[i];
5921 #endif
5922 }
5923
5924 /**
5925  *      dev_get_stats   - get network device statistics
5926  *      @dev: device to get statistics from
5927  *      @storage: place to store stats
5928  *
5929  *      Get network statistics from device. Return @storage.
5930  *      The device driver may provide its own method by setting
5931  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5932  *      otherwise the internal statistics structure is used.
5933  */
5934 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5935                                         struct rtnl_link_stats64 *storage)
5936 {
5937         const struct net_device_ops *ops = dev->netdev_ops;
5938
5939         if (ops->ndo_get_stats64) {
5940                 memset(storage, 0, sizeof(*storage));
5941                 ops->ndo_get_stats64(dev, storage);
5942         } else if (ops->ndo_get_stats) {
5943                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5944         } else {
5945                 netdev_stats_to_stats64(storage, &dev->stats);
5946         }
5947         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5948         return storage;
5949 }
5950 EXPORT_SYMBOL(dev_get_stats);
5951
5952 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5953 {
5954         struct netdev_queue *queue = dev_ingress_queue(dev);
5955
5956 #ifdef CONFIG_NET_CLS_ACT
5957         if (queue)
5958                 return queue;
5959         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5960         if (!queue)
5961                 return NULL;
5962         netdev_init_one_queue(dev, queue, NULL);
5963         queue->qdisc = &noop_qdisc;
5964         queue->qdisc_sleeping = &noop_qdisc;
5965         rcu_assign_pointer(dev->ingress_queue, queue);
5966 #endif
5967         return queue;
5968 }
5969
5970 /**
5971  *      alloc_netdev_mqs - allocate network device
5972  *      @sizeof_priv:   size of private data to allocate space for
5973  *      @name:          device name format string
5974  *      @setup:         callback to initialize device
5975  *      @txqs:          the number of TX subqueues to allocate
5976  *      @rxqs:          the number of RX subqueues to allocate
5977  *
5978  *      Allocates a struct net_device with private data area for driver use
5979  *      and performs basic initialization.  Also allocates subquue structs
5980  *      for each queue on the device.
5981  */
5982 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5983                 void (*setup)(struct net_device *),
5984                 unsigned int txqs, unsigned int rxqs)
5985 {
5986         struct net_device *dev;
5987         size_t alloc_size;
5988         struct net_device *p;
5989
5990         BUG_ON(strlen(name) >= sizeof(dev->name));
5991
5992         if (txqs < 1) {
5993                 pr_err("alloc_netdev: Unable to allocate device "
5994                        "with zero queues.\n");
5995                 return NULL;
5996         }
5997
5998 #ifdef CONFIG_RPS
5999         if (rxqs < 1) {
6000                 pr_err("alloc_netdev: Unable to allocate device "
6001                        "with zero RX queues.\n");
6002                 return NULL;
6003         }
6004 #endif
6005
6006         alloc_size = sizeof(struct net_device);
6007         if (sizeof_priv) {
6008                 /* ensure 32-byte alignment of private area */
6009                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6010                 alloc_size += sizeof_priv;
6011         }
6012         /* ensure 32-byte alignment of whole construct */
6013         alloc_size += NETDEV_ALIGN - 1;
6014
6015         p = kzalloc(alloc_size, GFP_KERNEL);
6016         if (!p) {
6017                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
6018                 return NULL;
6019         }
6020
6021         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6022         dev->padded = (char *)dev - (char *)p;
6023
6024         dev->pcpu_refcnt = alloc_percpu(int);
6025         if (!dev->pcpu_refcnt)
6026                 goto free_p;
6027
6028         if (dev_addr_init(dev))
6029                 goto free_pcpu;
6030
6031         dev_mc_init(dev);
6032         dev_uc_init(dev);
6033
6034         dev_net_set(dev, &init_net);
6035
6036         dev->gso_max_size = GSO_MAX_SIZE;
6037
6038         INIT_LIST_HEAD(&dev->napi_list);
6039         INIT_LIST_HEAD(&dev->unreg_list);
6040         INIT_LIST_HEAD(&dev->link_watch_list);
6041         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6042         setup(dev);
6043
6044         dev->num_tx_queues = txqs;
6045         dev->real_num_tx_queues = txqs;
6046         if (netif_alloc_netdev_queues(dev))
6047                 goto free_all;
6048
6049 #ifdef CONFIG_RPS
6050         dev->num_rx_queues = rxqs;
6051         dev->real_num_rx_queues = rxqs;
6052         if (netif_alloc_rx_queues(dev))
6053                 goto free_all;
6054 #endif
6055
6056         strcpy(dev->name, name);
6057         dev->group = INIT_NETDEV_GROUP;
6058         return dev;
6059
6060 free_all:
6061         free_netdev(dev);
6062         return NULL;
6063
6064 free_pcpu:
6065         free_percpu(dev->pcpu_refcnt);
6066         kfree(dev->_tx);
6067 #ifdef CONFIG_RPS
6068         kfree(dev->_rx);
6069 #endif
6070
6071 free_p:
6072         kfree(p);
6073         return NULL;
6074 }
6075 EXPORT_SYMBOL(alloc_netdev_mqs);
6076
6077 /**
6078  *      free_netdev - free network device
6079  *      @dev: device
6080  *
6081  *      This function does the last stage of destroying an allocated device
6082  *      interface. The reference to the device object is released.
6083  *      If this is the last reference then it will be freed.
6084  */
6085 void free_netdev(struct net_device *dev)
6086 {
6087         struct napi_struct *p, *n;
6088
6089         release_net(dev_net(dev));
6090
6091         kfree(dev->_tx);
6092 #ifdef CONFIG_RPS
6093         kfree(dev->_rx);
6094 #endif
6095
6096         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6097
6098         /* Flush device addresses */
6099         dev_addr_flush(dev);
6100
6101         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6102                 netif_napi_del(p);
6103
6104         free_percpu(dev->pcpu_refcnt);
6105         dev->pcpu_refcnt = NULL;
6106
6107         /*  Compatibility with error handling in drivers */
6108         if (dev->reg_state == NETREG_UNINITIALIZED) {
6109                 kfree((char *)dev - dev->padded);
6110                 return;
6111         }
6112
6113         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6114         dev->reg_state = NETREG_RELEASED;
6115
6116         /* will free via device release */
6117         put_device(&dev->dev);
6118 }
6119 EXPORT_SYMBOL(free_netdev);
6120
6121 /**
6122  *      synchronize_net -  Synchronize with packet receive processing
6123  *
6124  *      Wait for packets currently being received to be done.
6125  *      Does not block later packets from starting.
6126  */
6127 void synchronize_net(void)
6128 {
6129         might_sleep();
6130         if (rtnl_is_locked())
6131                 synchronize_rcu_expedited();
6132         else
6133                 synchronize_rcu();
6134 }
6135 EXPORT_SYMBOL(synchronize_net);
6136
6137 /**
6138  *      unregister_netdevice_queue - remove device from the kernel
6139  *      @dev: device
6140  *      @head: list
6141  *
6142  *      This function shuts down a device interface and removes it
6143  *      from the kernel tables.
6144  *      If head not NULL, device is queued to be unregistered later.
6145  *
6146  *      Callers must hold the rtnl semaphore.  You may want
6147  *      unregister_netdev() instead of this.
6148  */
6149
6150 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6151 {
6152         ASSERT_RTNL();
6153
6154         if (head) {
6155                 list_move_tail(&dev->unreg_list, head);
6156         } else {
6157                 rollback_registered(dev);
6158                 /* Finish processing unregister after unlock */
6159                 net_set_todo(dev);
6160         }
6161 }
6162 EXPORT_SYMBOL(unregister_netdevice_queue);
6163
6164 /**
6165  *      unregister_netdevice_many - unregister many devices
6166  *      @head: list of devices
6167  */
6168 void unregister_netdevice_many(struct list_head *head)
6169 {
6170         struct net_device *dev;
6171
6172         if (!list_empty(head)) {
6173                 rollback_registered_many(head);
6174                 list_for_each_entry(dev, head, unreg_list)
6175                         net_set_todo(dev);
6176         }
6177 }
6178 EXPORT_SYMBOL(unregister_netdevice_many);
6179
6180 /**
6181  *      unregister_netdev - remove device from the kernel
6182  *      @dev: device
6183  *
6184  *      This function shuts down a device interface and removes it
6185  *      from the kernel tables.
6186  *
6187  *      This is just a wrapper for unregister_netdevice that takes
6188  *      the rtnl semaphore.  In general you want to use this and not
6189  *      unregister_netdevice.
6190  */
6191 void unregister_netdev(struct net_device *dev)
6192 {
6193         rtnl_lock();
6194         unregister_netdevice(dev);
6195         rtnl_unlock();
6196 }
6197 EXPORT_SYMBOL(unregister_netdev);
6198
6199 /**
6200  *      dev_change_net_namespace - move device to different nethost namespace
6201  *      @dev: device
6202  *      @net: network namespace
6203  *      @pat: If not NULL name pattern to try if the current device name
6204  *            is already taken in the destination network namespace.
6205  *
6206  *      This function shuts down a device interface and moves it
6207  *      to a new network namespace. On success 0 is returned, on
6208  *      a failure a netagive errno code is returned.
6209  *
6210  *      Callers must hold the rtnl semaphore.
6211  */
6212
6213 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6214 {
6215         int err;
6216
6217         ASSERT_RTNL();
6218
6219         /* Don't allow namespace local devices to be moved. */
6220         err = -EINVAL;
6221         if (dev->features & NETIF_F_NETNS_LOCAL)
6222                 goto out;
6223
6224         /* Ensure the device has been registrered */
6225         err = -EINVAL;
6226         if (dev->reg_state != NETREG_REGISTERED)
6227                 goto out;
6228
6229         /* Get out if there is nothing todo */
6230         err = 0;
6231         if (net_eq(dev_net(dev), net))
6232                 goto out;
6233
6234         /* Pick the destination device name, and ensure
6235          * we can use it in the destination network namespace.
6236          */
6237         err = -EEXIST;
6238         if (__dev_get_by_name(net, dev->name)) {
6239                 /* We get here if we can't use the current device name */
6240                 if (!pat)
6241                         goto out;
6242                 if (dev_get_valid_name(dev, pat) < 0)
6243                         goto out;
6244         }
6245
6246         /*
6247          * And now a mini version of register_netdevice unregister_netdevice.
6248          */
6249
6250         /* If device is running close it first. */
6251         dev_close(dev);
6252
6253         /* And unlink it from device chain */
6254         err = -ENODEV;
6255         unlist_netdevice(dev);
6256
6257         synchronize_net();
6258
6259         /* Shutdown queueing discipline. */
6260         dev_shutdown(dev);
6261
6262         /* Notify protocols, that we are about to destroy
6263            this device. They should clean all the things.
6264
6265            Note that dev->reg_state stays at NETREG_REGISTERED.
6266            This is wanted because this way 8021q and macvlan know
6267            the device is just moving and can keep their slaves up.
6268         */
6269         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6270         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6271         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6272
6273         /*
6274          *      Flush the unicast and multicast chains
6275          */
6276         dev_uc_flush(dev);
6277         dev_mc_flush(dev);
6278
6279         /* Actually switch the network namespace */
6280         dev_net_set(dev, net);
6281
6282         /* If there is an ifindex conflict assign a new one */
6283         if (__dev_get_by_index(net, dev->ifindex)) {
6284                 int iflink = (dev->iflink == dev->ifindex);
6285                 dev->ifindex = dev_new_index(net);
6286                 if (iflink)
6287                         dev->iflink = dev->ifindex;
6288         }
6289
6290         /* Fixup kobjects */
6291         err = device_rename(&dev->dev, dev->name);
6292         WARN_ON(err);
6293
6294         /* Add the device back in the hashes */
6295         list_netdevice(dev);
6296
6297         /* Notify protocols, that a new device appeared. */
6298         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6299
6300         /*
6301          *      Prevent userspace races by waiting until the network
6302          *      device is fully setup before sending notifications.
6303          */
6304         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6305
6306         synchronize_net();
6307         err = 0;
6308 out:
6309         return err;
6310 }
6311 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6312
6313 static int dev_cpu_callback(struct notifier_block *nfb,
6314                             unsigned long action,
6315                             void *ocpu)
6316 {
6317         struct sk_buff **list_skb;
6318         struct sk_buff *skb;
6319         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6320         struct softnet_data *sd, *oldsd;
6321
6322         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6323                 return NOTIFY_OK;
6324
6325         local_irq_disable();
6326         cpu = smp_processor_id();
6327         sd = &per_cpu(softnet_data, cpu);
6328         oldsd = &per_cpu(softnet_data, oldcpu);
6329
6330         /* Find end of our completion_queue. */
6331         list_skb = &sd->completion_queue;
6332         while (*list_skb)
6333                 list_skb = &(*list_skb)->next;
6334         /* Append completion queue from offline CPU. */
6335         *list_skb = oldsd->completion_queue;
6336         oldsd->completion_queue = NULL;
6337
6338         /* Append output queue from offline CPU. */
6339         if (oldsd->output_queue) {
6340                 *sd->output_queue_tailp = oldsd->output_queue;
6341                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6342                 oldsd->output_queue = NULL;
6343                 oldsd->output_queue_tailp = &oldsd->output_queue;
6344         }
6345         /* Append NAPI poll list from offline CPU. */
6346         if (!list_empty(&oldsd->poll_list)) {
6347                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6348                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6349         }
6350
6351         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6352         local_irq_enable();
6353
6354         /* Process offline CPU's input_pkt_queue */
6355         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6356                 netif_rx(skb);
6357                 input_queue_head_incr(oldsd);
6358         }
6359         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6360                 netif_rx(skb);
6361                 input_queue_head_incr(oldsd);
6362         }
6363
6364         return NOTIFY_OK;
6365 }
6366
6367
6368 /**
6369  *      netdev_increment_features - increment feature set by one
6370  *      @all: current feature set
6371  *      @one: new feature set
6372  *      @mask: mask feature set
6373  *
6374  *      Computes a new feature set after adding a device with feature set
6375  *      @one to the master device with current feature set @all.  Will not
6376  *      enable anything that is off in @mask. Returns the new feature set.
6377  */
6378 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6379 {
6380         if (mask & NETIF_F_GEN_CSUM)
6381                 mask |= NETIF_F_ALL_CSUM;
6382         mask |= NETIF_F_VLAN_CHALLENGED;
6383
6384         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6385         all &= one | ~NETIF_F_ALL_FOR_ALL;
6386
6387         /* If device needs checksumming, downgrade to it. */
6388         if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6389                 all &= ~NETIF_F_NO_CSUM;
6390
6391         /* If one device supports hw checksumming, set for all. */
6392         if (all & NETIF_F_GEN_CSUM)
6393                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6394
6395         return all;
6396 }
6397 EXPORT_SYMBOL(netdev_increment_features);
6398
6399 static struct hlist_head *netdev_create_hash(void)
6400 {
6401         int i;
6402         struct hlist_head *hash;
6403
6404         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6405         if (hash != NULL)
6406                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6407                         INIT_HLIST_HEAD(&hash[i]);
6408
6409         return hash;
6410 }
6411
6412 /* Initialize per network namespace state */
6413 static int __net_init netdev_init(struct net *net)
6414 {
6415         INIT_LIST_HEAD(&net->dev_base_head);
6416
6417         net->dev_name_head = netdev_create_hash();
6418         if (net->dev_name_head == NULL)
6419                 goto err_name;
6420
6421         net->dev_index_head = netdev_create_hash();
6422         if (net->dev_index_head == NULL)
6423                 goto err_idx;
6424
6425         return 0;
6426
6427 err_idx:
6428         kfree(net->dev_name_head);
6429 err_name:
6430         return -ENOMEM;
6431 }
6432
6433 /**
6434  *      netdev_drivername - network driver for the device
6435  *      @dev: network device
6436  *
6437  *      Determine network driver for device.
6438  */
6439 const char *netdev_drivername(const struct net_device *dev)
6440 {
6441         const struct device_driver *driver;
6442         const struct device *parent;
6443         const char *empty = "";
6444
6445         parent = dev->dev.parent;
6446         if (!parent)
6447                 return empty;
6448
6449         driver = parent->driver;
6450         if (driver && driver->name)
6451                 return driver->name;
6452         return empty;
6453 }
6454
6455 int __netdev_printk(const char *level, const struct net_device *dev,
6456                            struct va_format *vaf)
6457 {
6458         int r;
6459
6460         if (dev && dev->dev.parent)
6461                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6462                                netdev_name(dev), vaf);
6463         else if (dev)
6464                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6465         else
6466                 r = printk("%s(NULL net_device): %pV", level, vaf);
6467
6468         return r;
6469 }
6470 EXPORT_SYMBOL(__netdev_printk);
6471
6472 int netdev_printk(const char *level, const struct net_device *dev,
6473                   const char *format, ...)
6474 {
6475         struct va_format vaf;
6476         va_list args;
6477         int r;
6478
6479         va_start(args, format);
6480
6481         vaf.fmt = format;
6482         vaf.va = &args;
6483
6484         r = __netdev_printk(level, dev, &vaf);
6485         va_end(args);
6486
6487         return r;
6488 }
6489 EXPORT_SYMBOL(netdev_printk);
6490
6491 #define define_netdev_printk_level(func, level)                 \
6492 int func(const struct net_device *dev, const char *fmt, ...)    \
6493 {                                                               \
6494         int r;                                                  \
6495         struct va_format vaf;                                   \
6496         va_list args;                                           \
6497                                                                 \
6498         va_start(args, fmt);                                    \
6499                                                                 \
6500         vaf.fmt = fmt;                                          \
6501         vaf.va = &args;                                         \
6502                                                                 \
6503         r = __netdev_printk(level, dev, &vaf);                  \
6504         va_end(args);                                           \
6505                                                                 \
6506         return r;                                               \
6507 }                                                               \
6508 EXPORT_SYMBOL(func);
6509
6510 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6511 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6512 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6513 define_netdev_printk_level(netdev_err, KERN_ERR);
6514 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6515 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6516 define_netdev_printk_level(netdev_info, KERN_INFO);
6517
6518 static void __net_exit netdev_exit(struct net *net)
6519 {
6520         kfree(net->dev_name_head);
6521         kfree(net->dev_index_head);
6522 }
6523
6524 static struct pernet_operations __net_initdata netdev_net_ops = {
6525         .init = netdev_init,
6526         .exit = netdev_exit,
6527 };
6528
6529 static void __net_exit default_device_exit(struct net *net)
6530 {
6531         struct net_device *dev, *aux;
6532         /*
6533          * Push all migratable network devices back to the
6534          * initial network namespace
6535          */
6536         rtnl_lock();
6537         for_each_netdev_safe(net, dev, aux) {
6538                 int err;
6539                 char fb_name[IFNAMSIZ];
6540
6541                 /* Ignore unmoveable devices (i.e. loopback) */
6542                 if (dev->features & NETIF_F_NETNS_LOCAL)
6543                         continue;
6544
6545                 /* Leave virtual devices for the generic cleanup */
6546                 if (dev->rtnl_link_ops)
6547                         continue;
6548
6549                 /* Push remaining network devices to init_net */
6550                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6551                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6552                 if (err) {
6553                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6554                                 __func__, dev->name, err);
6555                         BUG();
6556                 }
6557         }
6558         rtnl_unlock();
6559 }
6560
6561 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6562 {
6563         /* At exit all network devices most be removed from a network
6564          * namespace.  Do this in the reverse order of registration.
6565          * Do this across as many network namespaces as possible to
6566          * improve batching efficiency.
6567          */
6568         struct net_device *dev;
6569         struct net *net;
6570         LIST_HEAD(dev_kill_list);
6571
6572         rtnl_lock();
6573         list_for_each_entry(net, net_list, exit_list) {
6574                 for_each_netdev_reverse(net, dev) {
6575                         if (dev->rtnl_link_ops)
6576                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6577                         else
6578                                 unregister_netdevice_queue(dev, &dev_kill_list);
6579                 }
6580         }
6581         unregister_netdevice_many(&dev_kill_list);
6582         list_del(&dev_kill_list);
6583         rtnl_unlock();
6584 }
6585
6586 static struct pernet_operations __net_initdata default_device_ops = {
6587         .exit = default_device_exit,
6588         .exit_batch = default_device_exit_batch,
6589 };
6590
6591 /*
6592  *      Initialize the DEV module. At boot time this walks the device list and
6593  *      unhooks any devices that fail to initialise (normally hardware not
6594  *      present) and leaves us with a valid list of present and active devices.
6595  *
6596  */
6597
6598 /*
6599  *       This is called single threaded during boot, so no need
6600  *       to take the rtnl semaphore.
6601  */
6602 static int __init net_dev_init(void)
6603 {
6604         int i, rc = -ENOMEM;
6605
6606         BUG_ON(!dev_boot_phase);
6607
6608         if (dev_proc_init())
6609                 goto out;
6610
6611         if (netdev_kobject_init())
6612                 goto out;
6613
6614         INIT_LIST_HEAD(&ptype_all);
6615         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6616                 INIT_LIST_HEAD(&ptype_base[i]);
6617
6618         if (register_pernet_subsys(&netdev_net_ops))
6619                 goto out;
6620
6621         /*
6622          *      Initialise the packet receive queues.
6623          */
6624
6625         for_each_possible_cpu(i) {
6626                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6627
6628                 memset(sd, 0, sizeof(*sd));
6629                 skb_queue_head_init(&sd->input_pkt_queue);
6630                 skb_queue_head_init(&sd->process_queue);
6631                 sd->completion_queue = NULL;
6632                 INIT_LIST_HEAD(&sd->poll_list);
6633                 sd->output_queue = NULL;
6634                 sd->output_queue_tailp = &sd->output_queue;
6635 #ifdef CONFIG_RPS
6636                 sd->csd.func = rps_trigger_softirq;
6637                 sd->csd.info = sd;
6638                 sd->csd.flags = 0;
6639                 sd->cpu = i;
6640 #endif
6641
6642                 sd->backlog.poll = process_backlog;
6643                 sd->backlog.weight = weight_p;
6644                 sd->backlog.gro_list = NULL;
6645                 sd->backlog.gro_count = 0;
6646         }
6647
6648         dev_boot_phase = 0;
6649
6650         /* The loopback device is special if any other network devices
6651          * is present in a network namespace the loopback device must
6652          * be present. Since we now dynamically allocate and free the
6653          * loopback device ensure this invariant is maintained by
6654          * keeping the loopback device as the first device on the
6655          * list of network devices.  Ensuring the loopback devices
6656          * is the first device that appears and the last network device
6657          * that disappears.
6658          */
6659         if (register_pernet_device(&loopback_net_ops))
6660                 goto out;
6661
6662         if (register_pernet_device(&default_device_ops))
6663                 goto out;
6664
6665         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6666         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6667
6668         hotcpu_notifier(dev_cpu_callback, 0);
6669         dst_init();
6670         dev_mcast_init();
6671         rc = 0;
6672 out:
6673         return rc;
6674 }
6675
6676 subsys_initcall(net_dev_init);
6677
6678 static int __init initialize_hashrnd(void)
6679 {
6680         get_random_bytes(&hashrnd, sizeof(hashrnd));
6681         return 0;
6682 }
6683
6684 late_initcall_sync(initialize_hashrnd);
6685