net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135 #include <linux/cpu_rmap.h>
 136
 137 #include "net-sysfs.h"
 138
 139 /* Instead of increasing this, you should create a hash table. */
 140 #define MAX_GRO_SKBS 8
 141
 142 /* This should be increased if a protocol with a bigger head is added. */
 143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
 145 /*
 146  *      The list of packet types we will receive (as opposed to discard)
 147  *      and the routines to invoke.
 148  *
 149  *      Why 16. Because with 16 the only overlap we get on a hash of the
 150  *      low nibble of the protocol value is RARP/SNAP/X.25.
 151  *
 152  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 153  *             sure which should go first, but I bet it won't make much
 154  *             difference if we are running VLANs.  The good news is that
 155  *             this protocol won't be in the list unless compiled in, so
 156  *             the average user (w/out VLANs) will not be adversely affected.
 157  *             --BLG
 158  *
 159  *              0800    IP
 160  *              8100    802.1Q VLAN
 161  *              0001    802.3
 162  *              0002    AX.25
 163  *              0004    802.2
 164  *              8035    RARP
 165  *              0005    SNAP
 166  *              0805    X.25
 167  *              0806    ARP
 168  *              8137    IPX
 169  *              0009    Localtalk
 170  *              86DD    IPv6
 171  */
 172
 173 #define PTYPE_HASH_SIZE (16)
 174 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 175
 176 static DEFINE_SPINLOCK(ptype_lock);
 177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 178 static struct list_head ptype_all __read_mostly;        /* Taps */
 179
 180 /*
 181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182  * semaphore.
 183  *
 184  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 185  *
 186  * Writers must hold the rtnl semaphore while they loop through the
 187  * dev_base_head list, and hold dev_base_lock for writing when they do the
 188  * actual updates.  This allows pure readers to access the list even
 189  * while a writer is preparing to update it.
 190  *
 191  * To put it another way, dev_base_lock is held for writing only to
 192  * protect against pure readers; the rtnl semaphore provides the
 193  * protection against other writers.
 194  *
 195  * See, for example usages, register_netdevice() and
 196  * unregister_netdevice(), which must be called with the rtnl
 197  * semaphore held.
 198  */
 199 DEFINE_RWLOCK(dev_base_lock);
 200 EXPORT_SYMBOL(dev_base_lock);
 201
 202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 203 {
 204         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 205         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 206 }
 207
 208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 209 {
 210         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 211 }
 212
 213 static inline void rps_lock(struct softnet_data *sd)
 214 {
 215 #ifdef CONFIG_RPS
 216         spin_lock(&sd->input_pkt_queue.lock);
 217 #endif
 218 }
 219
 220 static inline void rps_unlock(struct softnet_data *sd)
 221 {
 222 #ifdef CONFIG_RPS
 223         spin_unlock(&sd->input_pkt_queue.lock);
 224 #endif
 225 }
 226
 227 /* Device list insertion */
 228 static int list_netdevice(struct net_device *dev)
 229 {
 230         struct net *net = dev_net(dev);
 231
 232         ASSERT_RTNL();
 233
 234         write_lock_bh(&dev_base_lock);
 235         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 236         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 237         hlist_add_head_rcu(&dev->index_hlist,
 238                            dev_index_hash(net, dev->ifindex));
 239         write_unlock_bh(&dev_base_lock);
 240         return 0;
 241 }
 242
 243 /* Device list removal
 244  * caller must respect a RCU grace period before freeing/reusing dev
 245  */
 246 static void unlist_netdevice(struct net_device *dev)
 247 {
 248         ASSERT_RTNL();
 249
 250         /* Unlink dev from the device chain */
 251         write_lock_bh(&dev_base_lock);
 252         list_del_rcu(&dev->dev_list);
 253         hlist_del_rcu(&dev->name_hlist);
 254         hlist_del_rcu(&dev->index_hlist);
 255         write_unlock_bh(&dev_base_lock);
 256 }
 257
 258 /*
 259  *      Our notifier list
 260  */
 261
 262 static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264 /*
 265  *      Device drivers call our routines to queue packets here. We empty the
 266  *      queue in the local softnet handler.
 267  */
 268
 269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 270 EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 272 #ifdef CONFIG_LOCKDEP
 273 /*
 274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 275  * according to dev->type
 276  */
 277 static const unsigned short netdev_lock_type[] =
 278         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 279          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 280          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 281          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 282          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 283          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 284          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 285          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 286          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 287          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 288          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 289          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 290          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 291          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 292          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 293          ARPHRD_VOID, ARPHRD_NONE};
 294
 295 static const char *const netdev_lock_name[] =
 296         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 297          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 298          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 299          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 300          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 301          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 302          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 303          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 304          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 305          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 306          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 307          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 308          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 309          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 310          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 311          "_xmit_VOID", "_xmit_NONE"};
 312
 313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 315
 316 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 317 {
 318         int i;
 319
 320         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 321                 if (netdev_lock_type[i] == dev_type)
 322                         return i;
 323         /* the last key is used by default */
 324         return ARRAY_SIZE(netdev_lock_type) - 1;
 325 }
 326
 327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 328                                                  unsigned short dev_type)
 329 {
 330         int i;
 331
 332         i = netdev_lock_pos(dev_type);
 333         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 334                                    netdev_lock_name[i]);
 335 }
 336
 337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 338 {
 339         int i;
 340
 341         i = netdev_lock_pos(dev->type);
 342         lockdep_set_class_and_name(&dev->addr_list_lock,
 343                                    &netdev_addr_lock_key[i],
 344                                    netdev_lock_name[i]);
 345 }
 346 #else
 347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 348                                                  unsigned short dev_type)
 349 {
 350 }
 351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 352 {
 353 }
 354 #endif
 355
 356 /*******************************************************************************
 357
 358                 Protocol management and registration routines
 359
 360 *******************************************************************************/
 361
 362 /*
 363  *      Add a protocol ID to the list. Now that the input handler is
 364  *      smarter we can dispense with all the messy stuff that used to be
 365  *      here.
 366  *
 367  *      BEWARE!!! Protocol handlers, mangling input packets,
 368  *      MUST BE last in hash buckets and checking protocol handlers
 369  *      MUST start from promiscuous ptype_all chain in net_bh.
 370  *      It is true now, do not change it.
 371  *      Explanation follows: if protocol handler, mangling packet, will
 372  *      be the first on list, it is not able to sense, that packet
 373  *      is cloned and should be copied-on-write, so that it will
 374  *      change it and subsequent readers will get broken packet.
 375  *                                                      --ANK (980803)
 376  */
 377
 378 static inline struct list_head *ptype_head(const struct packet_type *pt)
 379 {
 380         if (pt->type == htons(ETH_P_ALL))
 381                 return &ptype_all;
 382         else
 383                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 384 }
 385
 386 /**
 387  *      dev_add_pack - add packet handler
 388  *      @pt: packet type declaration
 389  *
 390  *      Add a protocol handler to the networking stack. The passed &packet_type
 391  *      is linked into kernel lists and may not be freed until it has been
 392  *      removed from the kernel lists.
 393  *
 394  *      This call does not sleep therefore it can not
 395  *      guarantee all CPU's that are in middle of receiving packets
 396  *      will see the new packet type (until the next received packet).
 397  */
 398
 399 void dev_add_pack(struct packet_type *pt)
 400 {
 401         struct list_head *head = ptype_head(pt);
 402
 403         spin_lock(&ptype_lock);
 404         list_add_rcu(&pt->list, head);
 405         spin_unlock(&ptype_lock);
 406 }
 407 EXPORT_SYMBOL(dev_add_pack);
 408
 409 /**
 410  *      __dev_remove_pack        - remove packet handler
 411  *      @pt: packet type declaration
 412  *
 413  *      Remove a protocol handler that was previously added to the kernel
 414  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 415  *      from the kernel lists and can be freed or reused once this function
 416  *      returns.
 417  *
 418  *      The packet type might still be in use by receivers
 419  *      and must not be freed until after all the CPU's have gone
 420  *      through a quiescent state.
 421  */
 422 void __dev_remove_pack(struct packet_type *pt)
 423 {
 424         struct list_head *head = ptype_head(pt);
 425         struct packet_type *pt1;
 426
 427         spin_lock(&ptype_lock);
 428
 429         list_for_each_entry(pt1, head, list) {
 430                 if (pt == pt1) {
 431                         list_del_rcu(&pt->list);
 432                         goto out;
 433                 }
 434         }
 435
 436         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 437 out:
 438         spin_unlock(&ptype_lock);
 439 }
 440 EXPORT_SYMBOL(__dev_remove_pack);
 441
 442 /**
 443  *      dev_remove_pack  - remove packet handler
 444  *      @pt: packet type declaration
 445  *
 446  *      Remove a protocol handler that was previously added to the kernel
 447  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 448  *      from the kernel lists and can be freed or reused once this function
 449  *      returns.
 450  *
 451  *      This call sleeps to guarantee that no CPU is looking at the packet
 452  *      type after return.
 453  */
 454 void dev_remove_pack(struct packet_type *pt)
 455 {
 456         __dev_remove_pack(pt);
 457
 458         synchronize_net();
 459 }
 460 EXPORT_SYMBOL(dev_remove_pack);
 461
 462 /******************************************************************************
 463
 464                       Device Boot-time Settings Routines
 465
 466 *******************************************************************************/
 467
 468 /* Boot time configuration table */
 469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 470
 471 /**
 472  *      netdev_boot_setup_add   - add new setup entry
 473  *      @name: name of the device
 474  *      @map: configured settings for the device
 475  *
 476  *      Adds new setup entry to the dev_boot_setup list.  The function
 477  *      returns 0 on error and 1 on success.  This is a generic routine to
 478  *      all netdevices.
 479  */
 480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 481 {
 482         struct netdev_boot_setup *s;
 483         int i;
 484
 485         s = dev_boot_setup;
 486         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 487                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 488                         memset(s[i].name, 0, sizeof(s[i].name));
 489                         strlcpy(s[i].name, name, IFNAMSIZ);
 490                         memcpy(&s[i].map, map, sizeof(s[i].map));
 491                         break;
 492                 }
 493         }
 494
 495         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 496 }
 497
 498 /**
 499  *      netdev_boot_setup_check - check boot time settings
 500  *      @dev: the netdevice
 501  *
 502  *      Check boot time settings for the device.
 503  *      The found settings are set for the device to be used
 504  *      later in the device probing.
 505  *      Returns 0 if no settings found, 1 if they are.
 506  */
 507 int netdev_boot_setup_check(struct net_device *dev)
 508 {
 509         struct netdev_boot_setup *s = dev_boot_setup;
 510         int i;
 511
 512         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 513                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 514                     !strcmp(dev->name, s[i].name)) {
 515                         dev->irq        = s[i].map.irq;
 516                         dev->base_addr  = s[i].map.base_addr;
 517                         dev->mem_start  = s[i].map.mem_start;
 518                         dev->mem_end    = s[i].map.mem_end;
 519                         return 1;
 520                 }
 521         }
 522         return 0;
 523 }
 524 EXPORT_SYMBOL(netdev_boot_setup_check);
 525
 526
 527 /**
 528  *      netdev_boot_base        - get address from boot time settings
 529  *      @prefix: prefix for network device
 530  *      @unit: id for network device
 531  *
 532  *      Check boot time settings for the base address of device.
 533  *      The found settings are set for the device to be used
 534  *      later in the device probing.
 535  *      Returns 0 if no settings found.
 536  */
 537 unsigned long netdev_boot_base(const char *prefix, int unit)
 538 {
 539         const struct netdev_boot_setup *s = dev_boot_setup;
 540         char name[IFNAMSIZ];
 541         int i;
 542
 543         sprintf(name, "%s%d", prefix, unit);
 544
 545         /*
 546          * If device already registered then return base of 1
 547          * to indicate not to probe for this interface
 548          */
 549         if (__dev_get_by_name(&init_net, name))
 550                 return 1;
 551
 552         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 553                 if (!strcmp(name, s[i].name))
 554                         return s[i].map.base_addr;
 555         return 0;
 556 }
 557
 558 /*
 559  * Saves at boot time configured settings for any netdevice.
 560  */
 561 int __init netdev_boot_setup(char *str)
 562 {
 563         int ints[5];
 564         struct ifmap map;
 565
 566         str = get_options(str, ARRAY_SIZE(ints), ints);
 567         if (!str || !*str)
 568                 return 0;
 569
 570         /* Save settings */
 571         memset(&map, 0, sizeof(map));
 572         if (ints[0] > 0)
 573                 map.irq = ints[1];
 574         if (ints[0] > 1)
 575                 map.base_addr = ints[2];
 576         if (ints[0] > 2)
 577                 map.mem_start = ints[3];
 578         if (ints[0] > 3)
 579                 map.mem_end = ints[4];
 580
 581         /* Add new entry to the list */
 582         return netdev_boot_setup_add(str, &map);
 583 }
 584
 585 __setup("netdev=", netdev_boot_setup);
 586
 587 /*******************************************************************************
 588
 589                             Device Interface Subroutines
 590
 591 *******************************************************************************/
 592
 593 /**
 594  *      __dev_get_by_name       - find a device by its name
 595  *      @net: the applicable net namespace
 596  *      @name: name to find
 597  *
 598  *      Find an interface by name. Must be called under RTNL semaphore
 599  *      or @dev_base_lock. If the name is found a pointer to the device
 600  *      is returned. If the name is not found then %NULL is returned. The
 601  *      reference counters are not incremented so the caller must be
 602  *      careful with locks.
 603  */
 604
 605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 606 {
 607         struct hlist_node *p;
 608         struct net_device *dev;
 609         struct hlist_head *head = dev_name_hash(net, name);
 610
 611         hlist_for_each_entry(dev, p, head, name_hlist)
 612                 if (!strncmp(dev->name, name, IFNAMSIZ))
 613                         return dev;
 614
 615         return NULL;
 616 }
 617 EXPORT_SYMBOL(__dev_get_by_name);
 618
 619 /**
 620  *      dev_get_by_name_rcu     - find a device by its name
 621  *      @net: the applicable net namespace
 622  *      @name: name to find
 623  *
 624  *      Find an interface by name.
 625  *      If the name is found a pointer to the device is returned.
 626  *      If the name is not found then %NULL is returned.
 627  *      The reference counters are not incremented so the caller must be
 628  *      careful with locks. The caller must hold RCU lock.
 629  */
 630
 631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 632 {
 633         struct hlist_node *p;
 634         struct net_device *dev;
 635         struct hlist_head *head = dev_name_hash(net, name);
 636
 637         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 638                 if (!strncmp(dev->name, name, IFNAMSIZ))
 639                         return dev;
 640
 641         return NULL;
 642 }
 643 EXPORT_SYMBOL(dev_get_by_name_rcu);
 644
 645 /**
 646  *      dev_get_by_name         - find a device by its name
 647  *      @net: the applicable net namespace
 648  *      @name: name to find
 649  *
 650  *      Find an interface by name. This can be called from any
 651  *      context and does its own locking. The returned handle has
 652  *      the usage count incremented and the caller must use dev_put() to
 653  *      release it when it is no longer needed. %NULL is returned if no
 654  *      matching device is found.
 655  */
 656
 657 struct net_device *dev_get_by_name(struct net *net, const char *name)
 658 {
 659         struct net_device *dev;
 660
 661         rcu_read_lock();
 662         dev = dev_get_by_name_rcu(net, name);
 663         if (dev)
 664                 dev_hold(dev);
 665         rcu_read_unlock();
 666         return dev;
 667 }
 668 EXPORT_SYMBOL(dev_get_by_name);
 669
 670 /**
 671  *      __dev_get_by_index - find a device by its ifindex
 672  *      @net: the applicable net namespace
 673  *      @ifindex: index of device
 674  *
 675  *      Search for an interface by index. Returns %NULL if the device
 676  *      is not found or a pointer to the device. The device has not
 677  *      had its reference counter increased so the caller must be careful
 678  *      about locking. The caller must hold either the RTNL semaphore
 679  *      or @dev_base_lock.
 680  */
 681
 682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 683 {
 684         struct hlist_node *p;
 685         struct net_device *dev;
 686         struct hlist_head *head = dev_index_hash(net, ifindex);
 687
 688         hlist_for_each_entry(dev, p, head, index_hlist)
 689                 if (dev->ifindex == ifindex)
 690                         return dev;
 691
 692         return NULL;
 693 }
 694 EXPORT_SYMBOL(__dev_get_by_index);
 695
 696 /**
 697  *      dev_get_by_index_rcu - find a device by its ifindex
 698  *      @net: the applicable net namespace
 699  *      @ifindex: index of device
 700  *
 701  *      Search for an interface by index. Returns %NULL if the device
 702  *      is not found or a pointer to the device. The device has not
 703  *      had its reference counter increased so the caller must be careful
 704  *      about locking. The caller must hold RCU lock.
 705  */
 706
 707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 708 {
 709         struct hlist_node *p;
 710         struct net_device *dev;
 711         struct hlist_head *head = dev_index_hash(net, ifindex);
 712
 713         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 714                 if (dev->ifindex == ifindex)
 715                         return dev;
 716
 717         return NULL;
 718 }
 719 EXPORT_SYMBOL(dev_get_by_index_rcu);
 720
 721
 722 /**
 723  *      dev_get_by_index - find a device by its ifindex
 724  *      @net: the applicable net namespace
 725  *      @ifindex: index of device
 726  *
 727  *      Search for an interface by index. Returns NULL if the device
 728  *      is not found or a pointer to the device. The device returned has
 729  *      had a reference added and the pointer is safe until the user calls
 730  *      dev_put to indicate they have finished with it.
 731  */
 732
 733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 734 {
 735         struct net_device *dev;
 736
 737         rcu_read_lock();
 738         dev = dev_get_by_index_rcu(net, ifindex);
 739         if (dev)
 740                 dev_hold(dev);
 741         rcu_read_unlock();
 742         return dev;
 743 }
 744 EXPORT_SYMBOL(dev_get_by_index);
 745
 746 /**
 747  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 748  *      @net: the applicable net namespace
 749  *      @type: media type of device
 750  *      @ha: hardware address
 751  *
 752  *      Search for an interface by MAC address. Returns NULL if the device
 753  *      is not found or a pointer to the device.
 754  *      The caller must hold RCU or RTNL.
 755  *      The returned device has not had its ref count increased
 756  *      and the caller must therefore be careful about locking
 757  *
 758  */
 759
 760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 761                                        const char *ha)
 762 {
 763         struct net_device *dev;
 764
 765         for_each_netdev_rcu(net, dev)
 766                 if (dev->type == type &&
 767                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 768                         return dev;
 769
 770         return NULL;
 771 }
 772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 773
 774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 775 {
 776         struct net_device *dev;
 777
 778         ASSERT_RTNL();
 779         for_each_netdev(net, dev)
 780                 if (dev->type == type)
 781                         return dev;
 782
 783         return NULL;
 784 }
 785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 786
 787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 788 {
 789         struct net_device *dev, *ret = NULL;
 790
 791         rcu_read_lock();
 792         for_each_netdev_rcu(net, dev)
 793                 if (dev->type == type) {
 794                         dev_hold(dev);
 795                         ret = dev;
 796                         break;
 797                 }
 798         rcu_read_unlock();
 799         return ret;
 800 }
 801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 802
 803 /**
 804  *      dev_get_by_flags_rcu - find any device with given flags
 805  *      @net: the applicable net namespace
 806  *      @if_flags: IFF_* values
 807  *      @mask: bitmask of bits in if_flags to check
 808  *
 809  *      Search for any interface with the given flags. Returns NULL if a device
 810  *      is not found or a pointer to the device. Must be called inside
 811  *      rcu_read_lock(), and result refcount is unchanged.
 812  */
 813
 814 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 815                                     unsigned short mask)
 816 {
 817         struct net_device *dev, *ret;
 818
 819         ret = NULL;
 820         for_each_netdev_rcu(net, dev) {
 821                 if (((dev->flags ^ if_flags) & mask) == 0) {
 822                         ret = dev;
 823                         break;
 824                 }
 825         }
 826         return ret;
 827 }
 828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 829
 830 /**
 831  *      dev_valid_name - check if name is okay for network device
 832  *      @name: name string
 833  *
 834  *      Network device names need to be valid file names to
 835  *      to allow sysfs to work.  We also disallow any kind of
 836  *      whitespace.
 837  */
 838 int dev_valid_name(const char *name)
 839 {
 840         if (*name == '\0')
 841                 return 0;
 842         if (strlen(name) >= IFNAMSIZ)
 843                 return 0;
 844         if (!strcmp(name, ".") || !strcmp(name, ".."))
 845                 return 0;
 846
 847         while (*name) {
 848                 if (*name == '/' || isspace(*name))
 849                         return 0;
 850                 name++;
 851         }
 852         return 1;
 853 }
 854 EXPORT_SYMBOL(dev_valid_name);
 855
 856 /**
 857  *      __dev_alloc_name - allocate a name for a device
 858  *      @net: network namespace to allocate the device name in
 859  *      @name: name format string
 860  *      @buf:  scratch buffer and result name string
 861  *
 862  *      Passed a format string - eg "lt%d" it will try and find a suitable
 863  *      id. It scans list of devices to build up a free map, then chooses
 864  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 865  *      while allocating the name and adding the device in order to avoid
 866  *      duplicates.
 867  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 868  *      Returns the number of the unit assigned or a negative errno code.
 869  */
 870
 871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 872 {
 873         int i = 0;
 874         const char *p;
 875         const int max_netdevices = 8*PAGE_SIZE;
 876         unsigned long *inuse;
 877         struct net_device *d;
 878
 879         p = strnchr(name, IFNAMSIZ-1, '%');
 880         if (p) {
 881                 /*
 882                  * Verify the string as this thing may have come from
 883                  * the user.  There must be either one "%d" and no other "%"
 884                  * characters.
 885                  */
 886                 if (p[1] != 'd' || strchr(p + 2, '%'))
 887                         return -EINVAL;
 888
 889                 /* Use one page as a bit array of possible slots */
 890                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 891                 if (!inuse)
 892                         return -ENOMEM;
 893
 894                 for_each_netdev(net, d) {
 895                         if (!sscanf(d->name, name, &i))
 896                                 continue;
 897                         if (i < 0 || i >= max_netdevices)
 898                                 continue;
 899
 900                         /*  avoid cases where sscanf is not exact inverse of printf */
 901                         snprintf(buf, IFNAMSIZ, name, i);
 902                         if (!strncmp(buf, d->name, IFNAMSIZ))
 903                                 set_bit(i, inuse);
 904                 }
 905
 906                 i = find_first_zero_bit(inuse, max_netdevices);
 907                 free_page((unsigned long) inuse);
 908         }
 909
 910         if (buf != name)
 911                 snprintf(buf, IFNAMSIZ, name, i);
 912         if (!__dev_get_by_name(net, buf))
 913                 return i;
 914
 915         /* It is possible to run out of possible slots
 916          * when the name is long and there isn't enough space left
 917          * for the digits, or if all bits are used.
 918          */
 919         return -ENFILE;
 920 }
 921
 922 /**
 923  *      dev_alloc_name - allocate a name for a device
 924  *      @dev: device
 925  *      @name: name format string
 926  *
 927  *      Passed a format string - eg "lt%d" it will try and find a suitable
 928  *      id. It scans list of devices to build up a free map, then chooses
 929  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 930  *      while allocating the name and adding the device in order to avoid
 931  *      duplicates.
 932  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 933  *      Returns the number of the unit assigned or a negative errno code.
 934  */
 935
 936 int dev_alloc_name(struct net_device *dev, const char *name)
 937 {
 938         char buf[IFNAMSIZ];
 939         struct net *net;
 940         int ret;
 941
 942         BUG_ON(!dev_net(dev));
 943         net = dev_net(dev);
 944         ret = __dev_alloc_name(net, name, buf);
 945         if (ret >= 0)
 946                 strlcpy(dev->name, buf, IFNAMSIZ);
 947         return ret;
 948 }
 949 EXPORT_SYMBOL(dev_alloc_name);
 950
 951 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 952 {
 953         struct net *net;
 954
 955         BUG_ON(!dev_net(dev));
 956         net = dev_net(dev);
 957
 958         if (!dev_valid_name(name))
 959                 return -EINVAL;
 960
 961         if (fmt && strchr(name, '%'))
 962                 return dev_alloc_name(dev, name);
 963         else if (__dev_get_by_name(net, name))
 964                 return -EEXIST;
 965         else if (dev->name != name)
 966                 strlcpy(dev->name, name, IFNAMSIZ);
 967
 968         return 0;
 969 }
 970
 971 /**
 972  *      dev_change_name - change name of a device
 973  *      @dev: device
 974  *      @newname: name (or format string) must be at least IFNAMSIZ
 975  *
 976  *      Change name of a device, can pass format strings "eth%d".
 977  *      for wildcarding.
 978  */
 979 int dev_change_name(struct net_device *dev, const char *newname)
 980 {
 981         char oldname[IFNAMSIZ];
 982         int err = 0;
 983         int ret;
 984         struct net *net;
 985
 986         ASSERT_RTNL();
 987         BUG_ON(!dev_net(dev));
 988
 989         net = dev_net(dev);
 990         if (dev->flags & IFF_UP)
 991                 return -EBUSY;
 992
 993         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 994                 return 0;
 995
 996         memcpy(oldname, dev->name, IFNAMSIZ);
 997
 998         err = dev_get_valid_name(dev, newname, 1);
 999         if (err < 0)
1000                 return err;
1001
1002 rollback:
1003         ret = device_rename(&dev->dev, dev->name);
1004         if (ret) {
1005                 memcpy(dev->name, oldname, IFNAMSIZ);
1006                 return ret;
1007         }
1008
1009         write_lock_bh(&dev_base_lock);
1010         hlist_del(&dev->name_hlist);
1011         write_unlock_bh(&dev_base_lock);
1012
1013         synchronize_rcu();
1014
1015         write_lock_bh(&dev_base_lock);
1016         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017         write_unlock_bh(&dev_base_lock);
1018
1019         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020         ret = notifier_to_errno(ret);
1021
1022         if (ret) {
1023                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1024                 if (err >= 0) {
1025                         err = ret;
1026                         memcpy(dev->name, oldname, IFNAMSIZ);
1027                         goto rollback;
1028                 } else {
1029                         printk(KERN_ERR
1030                                "%s: name change rollback failed: %d.\n",
1031                                dev->name, ret);
1032                 }
1033         }
1034
1035         return err;
1036 }
1037
1038 /**
1039  *      dev_set_alias - change ifalias of a device
1040  *      @dev: device
1041  *      @alias: name up to IFALIASZ
1042  *      @len: limit of bytes to copy from info
1043  *
1044  *      Set ifalias for a device,
1045  */
1046 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047 {
1048         ASSERT_RTNL();
1049
1050         if (len >= IFALIASZ)
1051                 return -EINVAL;
1052
1053         if (!len) {
1054                 if (dev->ifalias) {
1055                         kfree(dev->ifalias);
1056                         dev->ifalias = NULL;
1057                 }
1058                 return 0;
1059         }
1060
1061         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062         if (!dev->ifalias)
1063                 return -ENOMEM;
1064
1065         strlcpy(dev->ifalias, alias, len+1);
1066         return len;
1067 }
1068
1069
1070 /**
1071  *      netdev_features_change - device changes features
1072  *      @dev: device to cause notification
1073  *
1074  *      Called to indicate a device has changed features.
1075  */
1076 void netdev_features_change(struct net_device *dev)
1077 {
1078         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079 }
1080 EXPORT_SYMBOL(netdev_features_change);
1081
1082 /**
1083  *      netdev_state_change - device changes state
1084  *      @dev: device to cause notification
1085  *
1086  *      Called to indicate a device has changed state. This function calls
1087  *      the notifier chains for netdev_chain and sends a NEWLINK message
1088  *      to the routing socket.
1089  */
1090 void netdev_state_change(struct net_device *dev)
1091 {
1092         if (dev->flags & IFF_UP) {
1093                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095         }
1096 }
1097 EXPORT_SYMBOL(netdev_state_change);
1098
/*
 * Run the netdevice notifier chain for @dev with the caller-supplied
 * @event and hand back whatever the chain returned.
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int chain_ret = call_netdevice_notifiers(event, dev);

	return chain_ret;
}
EXPORT_SYMBOL(netdev_bonding_change);
1104
1105 /**
1106  *      dev_load        - load a network module
1107  *      @net: the applicable net namespace
1108  *      @name: name of interface
1109  *
1110  *      If a network interface is not present and the process has suitable
1111  *      privileges this function loads the module. If module loading is not
1112  *      available in this kernel then it becomes a nop.
1113  */
1114
1115 void dev_load(struct net *net, const char *name)
1116 {
1117         struct net_device *dev;
1118         int no_module;
1119
1120         rcu_read_lock();
1121         dev = dev_get_by_name_rcu(net, name);
1122         rcu_read_unlock();
1123
1124         no_module = !dev;
1125         if (no_module && capable(CAP_NET_ADMIN))
1126                 no_module = request_module("netdev-%s", name);
1127         if (no_module && capable(CAP_SYS_MODULE)) {
1128                 if (!request_module("%s", name))
1129                         pr_err("Loading kernel module for a network device "
1130 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1131 "instead\n", name);
1132         }
1133 }
1134 EXPORT_SYMBOL(dev_load);
1135
1136 static int __dev_open(struct net_device *dev)
1137 {
1138         const struct net_device_ops *ops = dev->netdev_ops;
1139         int ret;
1140
1141         ASSERT_RTNL();
1142
1143         if (!netif_device_present(dev))
1144                 return -ENODEV;
1145
1146         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1147         ret = notifier_to_errno(ret);
1148         if (ret)
1149                 return ret;
1150
1151         set_bit(__LINK_STATE_START, &dev->state);
1152
1153         if (ops->ndo_validate_addr)
1154                 ret = ops->ndo_validate_addr(dev);
1155
1156         if (!ret && ops->ndo_open)
1157                 ret = ops->ndo_open(dev);
1158
1159         if (ret)
1160                 clear_bit(__LINK_STATE_START, &dev->state);
1161         else {
1162                 dev->flags |= IFF_UP;
1163                 net_dmaengine_get();
1164                 dev_set_rx_mode(dev);
1165                 dev_activate(dev);
1166         }
1167
1168         return ret;
1169 }
1170
1171 /**
1172  *      dev_open        - prepare an interface for use.
1173  *      @dev:   device to open
1174  *
1175  *      Takes a device from down to up state. The device's private open
1176  *      function is invoked and then the multicast lists are loaded. Finally
1177  *      the device is moved into the up state and a %NETDEV_UP message is
1178  *      sent to the netdev notifier chain.
1179  *
1180  *      Calling this function on an active interface is a nop. On a failure
1181  *      a negative errno code is returned.
1182  */
1183 int dev_open(struct net_device *dev)
1184 {
1185         int ret;
1186
1187         if (dev->flags & IFF_UP)
1188                 return 0;
1189
1190         ret = __dev_open(dev);
1191         if (ret < 0)
1192                 return ret;
1193
1194         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1195         call_netdevice_notifiers(NETDEV_UP, dev);
1196
1197         return ret;
1198 }
1199 EXPORT_SYMBOL(dev_open);
1200
1201 static int __dev_close_many(struct list_head *head)
1202 {
1203         struct net_device *dev;
1204
1205         ASSERT_RTNL();
1206         might_sleep();
1207
1208         list_for_each_entry(dev, head, unreg_list) {
1209                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1210
1211                 clear_bit(__LINK_STATE_START, &dev->state);
1212
1213                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1214                  * can be even on different cpu. So just clear netif_running().
1215                  *
1216                  * dev->stop() will invoke napi_disable() on all of it's
1217                  * napi_struct instances on this device.
1218                  */
1219                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1220         }
1221
1222         dev_deactivate_many(head);
1223
1224         list_for_each_entry(dev, head, unreg_list) {
1225                 const struct net_device_ops *ops = dev->netdev_ops;
1226
1227                 /*
1228                  *      Call the device specific close. This cannot fail.
1229                  *      Only if device is UP
1230                  *
1231                  *      We allow it to be called even after a DETACH hot-plug
1232                  *      event.
1233                  */
1234                 if (ops->ndo_stop)
1235                         ops->ndo_stop(dev);
1236
1237                 dev->flags &= ~IFF_UP;
1238                 net_dmaengine_put();
1239         }
1240
1241         return 0;
1242 }
1243
1244 static int __dev_close(struct net_device *dev)
1245 {
1246         int retval;
1247         LIST_HEAD(single);
1248
1249         list_add(&dev->unreg_list, &single);
1250         retval = __dev_close_many(&single);
1251         list_del(&single);
1252         return retval;
1253 }
1254
1255 static int dev_close_many(struct list_head *head)
1256 {
1257         struct net_device *dev, *tmp;
1258         LIST_HEAD(tmp_list);
1259
1260         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1261                 if (!(dev->flags & IFF_UP))
1262                         list_move(&dev->unreg_list, &tmp_list);
1263
1264         __dev_close_many(head);
1265
1266         list_for_each_entry(dev, head, unreg_list) {
1267                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1269         }
1270
1271         /* rollback_registered_many needs the complete original list */
1272         list_splice(&tmp_list, head);
1273         return 0;
1274 }
1275
1276 /**
1277  *      dev_close - shutdown an interface.
1278  *      @dev: device to shutdown
1279  *
1280  *      This function moves an active device into down state. A
1281  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1282  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1283  *      chain.
1284  */
1285 int dev_close(struct net_device *dev)
1286 {
1287         LIST_HEAD(single);
1288
1289         list_add(&dev->unreg_list, &single);
1290         dev_close_many(&single);
1291         list_del(&single);
1292         return 0;
1293 }
1294 EXPORT_SYMBOL(dev_close);
1295
1296
1297 /**
1298  *      dev_disable_lro - disable Large Receive Offload on a device
1299  *      @dev: device
1300  *
1301  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1302  *      called under RTNL.  This is needed if received packets may be
1303  *      forwarded to another interface.
1304  */
1305 void dev_disable_lro(struct net_device *dev)
1306 {
1307         u32 flags;
1308
1309         if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1310                 flags = dev->ethtool_ops->get_flags(dev);
1311         else
1312                 flags = ethtool_op_get_flags(dev);
1313
1314         if (!(flags & ETH_FLAG_LRO))
1315                 return;
1316
1317         __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1318         WARN_ON(dev->features & NETIF_F_LRO);
1319 }
1320 EXPORT_SYMBOL(dev_disable_lro);
1321
1322
1323 static int dev_boot_phase = 1;
1324
1325 /**
1326  *      register_netdevice_notifier - register a network notifier block
1327  *      @nb: notifier
1328  *
1329  *      Register a notifier to be called when network device events occur.
1330  *      The notifier passed is linked into the kernel structures and must
1331  *      not be reused until it has been unregistered. A negative errno code
1332  *      is returned on a failure.
1333  *
1334  *      When registered all registration and up events are replayed
1335  *      to the new notifier to allow device to have a race free
1336  *      view of the network device list.
1337  */
1338
1339 int register_netdevice_notifier(struct notifier_block *nb)
1340 {
1341         struct net_device *dev;
1342         struct net_device *last;
1343         struct net *net;
1344         int err;
1345
1346         rtnl_lock();
1347         err = raw_notifier_chain_register(&netdev_chain, nb);
1348         if (err)
1349                 goto unlock;
1350         if (dev_boot_phase)
1351                 goto unlock;
1352         for_each_net(net) {
1353                 for_each_netdev(net, dev) {
1354                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1355                         err = notifier_to_errno(err);
1356                         if (err)
1357                                 goto rollback;
1358
1359                         if (!(dev->flags & IFF_UP))
1360                                 continue;
1361
1362                         nb->notifier_call(nb, NETDEV_UP, dev);
1363                 }
1364         }
1365
1366 unlock:
1367         rtnl_unlock();
1368         return err;
1369
1370 rollback:
1371         last = dev;
1372         for_each_net(net) {
1373                 for_each_netdev(net, dev) {
1374                         if (dev == last)
1375                                 break;
1376
1377                         if (dev->flags & IFF_UP) {
1378                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1379                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1380                         }
1381                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1382                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1383                 }
1384         }
1385
1386         raw_notifier_chain_unregister(&netdev_chain, nb);
1387         goto unlock;
1388 }
1389 EXPORT_SYMBOL(register_netdevice_notifier);
1390
1391 /**
1392  *      unregister_netdevice_notifier - unregister a network notifier block
1393  *      @nb: notifier
1394  *
1395  *      Unregister a notifier previously registered by
1396  *      register_netdevice_notifier(). The notifier is unlinked into the
1397  *      kernel structures and may then be reused. A negative errno code
1398  *      is returned on a failure.
1399  */
1400
1401 int unregister_netdevice_notifier(struct notifier_block *nb)
1402 {
1403         int err;
1404
1405         rtnl_lock();
1406         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1407         rtnl_unlock();
1408         return err;
1409 }
1410 EXPORT_SYMBOL(unregister_netdevice_notifier);
1411
1412 /**
1413  *      call_netdevice_notifiers - call all network notifier blocks
1414  *      @val: value passed unmodified to notifier function
1415  *      @dev: net_device pointer passed unmodified to notifier function
1416  *
1417  *      Call all network notifier blocks.  Parameters and return value
1418  *      are as for raw_notifier_call_chain().
1419  */
1420
1421 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1422 {
1423         ASSERT_RTNL();
1424         return raw_notifier_call_chain(&netdev_chain, val, dev);
1425 }
1426 EXPORT_SYMBOL(call_netdevice_notifiers);
1427
1428 /* When > 0 there are consumers of rx skb time stamps */
1429 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1430
1431 void net_enable_timestamp(void)
1432 {
1433         atomic_inc(&netstamp_needed);
1434 }
1435 EXPORT_SYMBOL(net_enable_timestamp);
1436
1437 void net_disable_timestamp(void)
1438 {
1439         atomic_dec(&netstamp_needed);
1440 }
1441 EXPORT_SYMBOL(net_disable_timestamp);
1442
1443 static inline void net_timestamp_set(struct sk_buff *skb)
1444 {
1445         if (atomic_read(&netstamp_needed))
1446                 __net_timestamp(skb);
1447         else
1448                 skb->tstamp.tv64 = 0;
1449 }
1450
1451 static inline void net_timestamp_check(struct sk_buff *skb)
1452 {
1453         if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1454                 __net_timestamp(skb);
1455 }
1456
1457 /**
1458  * dev_forward_skb - loopback an skb to another netif
1459  *
1460  * @dev: destination network device
1461  * @skb: buffer to forward
1462  *
1463  * return values:
1464  *      NET_RX_SUCCESS  (no congestion)
1465  *      NET_RX_DROP     (packet was dropped, but freed)
1466  *
1467  * dev_forward_skb can be used for injecting an skb from the
1468  * start_xmit function of one device into the receive queue
1469  * of another device.
1470  *
1471  * The receiving device may be in another namespace, so
1472  * we have to clear all information in the skb that could
1473  * impact namespace isolation.
1474  */
1475 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1476 {
1477         skb_orphan(skb);
1478         nf_reset(skb);
1479
1480         if (unlikely(!(dev->flags & IFF_UP) ||
1481                      (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1482                 atomic_long_inc(&dev->rx_dropped);
1483                 kfree_skb(skb);
1484                 return NET_RX_DROP;
1485         }
1486         skb_set_dev(skb, dev);
1487         skb->tstamp.tv64 = 0;
1488         skb->pkt_type = PACKET_HOST;
1489         skb->protocol = eth_type_trans(skb, dev);
1490         return netif_rx(skb);
1491 }
1492 EXPORT_SYMBOL_GPL(dev_forward_skb);
1493
1494 static inline int deliver_skb(struct sk_buff *skb,
1495                               struct packet_type *pt_prev,
1496                               struct net_device *orig_dev)
1497 {
1498         atomic_inc(&skb->users);
1499         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1500 }
1501
1502 /*
1503  *      Support routine. Sends outgoing frames to any network
1504  *      taps currently in use.
1505  */
1506
1507 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1508 {
1509         struct packet_type *ptype;
1510         struct sk_buff *skb2 = NULL;
1511         struct packet_type *pt_prev = NULL;
1512
1513         rcu_read_lock();
1514         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1515                 /* Never send packets back to the socket
1516                  * they originated from - MvS (miquels@drinkel.ow.org)
1517                  */
1518                 if ((ptype->dev == dev || !ptype->dev) &&
1519                     (ptype->af_packet_priv == NULL ||
1520                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1521                         if (pt_prev) {
1522                                 deliver_skb(skb2, pt_prev, skb->dev);
1523                                 pt_prev = ptype;
1524                                 continue;
1525                         }
1526
1527                         skb2 = skb_clone(skb, GFP_ATOMIC);
1528                         if (!skb2)
1529                                 break;
1530
1531                         net_timestamp_set(skb2);
1532
1533                         /* skb->nh should be correctly
1534                            set by sender, so that the second statement is
1535                            just protection against buggy protocols.
1536                          */
1537                         skb_reset_mac_header(skb2);
1538
1539                         if (skb_network_header(skb2) < skb2->data ||
1540                             skb2->network_header > skb2->tail) {
1541                                 if (net_ratelimit())
1542                                         printk(KERN_CRIT "protocol %04x is "
1543                                                "buggy, dev %s\n",
1544                                                ntohs(skb2->protocol),
1545                                                dev->name);
1546                                 skb_reset_network_header(skb2);
1547                         }
1548
1549                         skb2->transport_header = skb2->network_header;
1550                         skb2->pkt_type = PACKET_OUTGOING;
1551                         pt_prev = ptype;
1552                 }
1553         }
1554         if (pt_prev)
1555                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1556         rcu_read_unlock();
1557 }
1558
1559 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1560  * @dev: Network device
1561  * @txq: number of queues available
1562  *
1563  * If real_num_tx_queues is changed the tc mappings may no longer be
1564  * valid. To resolve this verify the tc mapping remains valid and if
1565  * not NULL the mapping. With no priorities mapping to this
1566  * offset/count pair it will no longer be used. In the worst case TC0
1567  * is invalid nothing can be done so disable priority mappings. If is
1568  * expected that drivers will fix this mapping if they can before
1569  * calling netif_set_real_num_tx_queues.
1570  */
1571 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1572 {
1573         int i;
1574         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1575
1576         /* If TC0 is invalidated disable TC mapping */
1577         if (tc->offset + tc->count > txq) {
1578                 pr_warning("Number of in use tx queues changed "
1579                            "invalidating tc mappings. Priority "
1580                            "traffic classification disabled!\n");
1581                 dev->num_tc = 0;
1582                 return;
1583         }
1584
1585         /* Invalidated prio to tc mappings set to TC0 */
1586         for (i = 1; i < TC_BITMASK + 1; i++) {
1587                 int q = netdev_get_prio_tc_map(dev, i);
1588
1589                 tc = &dev->tc_to_txq[q];
1590                 if (tc->offset + tc->count > txq) {
1591                         pr_warning("Number of in use tx queues "
1592                                    "changed. Priority %i to tc "
1593                                    "mapping %i is no longer valid "
1594                                    "setting map to 0\n",
1595                                    i, q);
1596                         netdev_set_prio_tc_map(dev, i, 0);
1597                 }
1598         }
1599 }
1600
1601 /*
1602  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1603  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1604  */
1605 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1606 {
1607         int rc;
1608
1609         if (txq < 1 || txq > dev->num_tx_queues)
1610                 return -EINVAL;
1611
1612         if (dev->reg_state == NETREG_REGISTERED ||
1613             dev->reg_state == NETREG_UNREGISTERING) {
1614                 ASSERT_RTNL();
1615
1616                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1617                                                   txq);
1618                 if (rc)
1619                         return rc;
1620
1621                 if (dev->num_tc)
1622                         netif_setup_tc(dev, txq);
1623
1624                 if (txq < dev->real_num_tx_queues)
1625                         qdisc_reset_all_tx_gt(dev, txq);
1626         }
1627
1628         dev->real_num_tx_queues = txq;
1629         return 0;
1630 }
1631 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1632
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code.  If called before registration, it always
 *	succeeds as long as @rxq is between 1 and dev->num_rx_queues.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	/* Only a registered device has kobjects that need updating. */
	if (dev->reg_state == NETREG_REGISTERED) {
		int rc;

		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1665
1666 static inline void __netif_reschedule(struct Qdisc *q)
1667 {
1668         struct softnet_data *sd;
1669         unsigned long flags;
1670
1671         local_irq_save(flags);
1672         sd = &__get_cpu_var(softnet_data);
1673         q->next_sched = NULL;
1674         *sd->output_queue_tailp = q;
1675         sd->output_queue_tailp = &q->next_sched;
1676         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1677         local_irq_restore(flags);
1678 }
1679
1680 void __netif_schedule(struct Qdisc *q)
1681 {
1682         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1683                 __netif_reschedule(q);
1684 }
1685 EXPORT_SYMBOL(__netif_schedule);
1686
1687 void dev_kfree_skb_irq(struct sk_buff *skb)
1688 {
1689         if (atomic_dec_and_test(&skb->users)) {
1690                 struct softnet_data *sd;
1691                 unsigned long flags;
1692
1693                 local_irq_save(flags);
1694                 sd = &__get_cpu_var(softnet_data);
1695                 skb->next = sd->completion_queue;
1696                 sd->completion_queue = skb;
1697                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1698                 local_irq_restore(flags);
1699         }
1700 }
1701 EXPORT_SYMBOL(dev_kfree_skb_irq);
1702
/*
 * Free @skb from any context: use dev_kfree_skb() when neither in hard
 * interrupt context nor with interrupts disabled, and fall back to
 * dev_kfree_skb_irq() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1711
1712
1713 /**
1714  * netif_device_detach - mark device as removed
1715  * @dev: network device
1716  *
1717  * Mark device as removed from system and therefore no longer available.
1718  */
1719 void netif_device_detach(struct net_device *dev)
1720 {
1721         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1722             netif_running(dev)) {
1723                 netif_tx_stop_all_queues(dev);
1724         }
1725 }
1726 EXPORT_SYMBOL(netif_device_detach);
1727
1728 /**
1729  * netif_device_attach - mark device as attached
1730  * @dev: network device
1731  *
1732  * Mark device as attached from system and restart if needed.
1733  */
1734 void netif_device_attach(struct net_device *dev)
1735 {
1736         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1737             netif_running(dev)) {
1738                 netif_tx_wake_all_queues(dev);
1739                 __netdev_watchdog_up(dev);
1740         }
1741 }
1742 EXPORT_SYMBOL(netif_device_attach);
1743
/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The dst reference is always dropped.  If the skb already belongs to a
 * device living in a different network namespace than @dev, all state
 * private to the old namespace is reset before the new device is
 * assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
        struct net_device *prev = skb->dev;

        skb_dst_drop(skb);

        if (!prev || net_eq(dev_net(prev), dev_net(dev)))
                goto assign;

        /* Crossing a namespace boundary: scrub per-namespace state. */
        secpath_reset(skb);
        nf_reset(skb);
        skb_init_secmark(skb);
        skb->mark = 0;
        skb->priority = 0;
        skb->nf_trace = 0;
        skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
        skb->tc_index = 0;
#endif

assign:
        skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1773
1774 /*
1775  * Invalidate hardware checksum when packet is to be mangled, and
1776  * complete checksum manually on outgoing path.
1777  */
1778 int skb_checksum_help(struct sk_buff *skb)
1779 {
1780         __wsum csum;
1781         int ret = 0, offset;
1782
1783         if (skb->ip_summed == CHECKSUM_COMPLETE)
1784                 goto out_set_summed;
1785
1786         if (unlikely(skb_shinfo(skb)->gso_size)) {
1787                 /* Let GSO fix up the checksum. */
1788                 goto out_set_summed;
1789         }
1790
1791         offset = skb_checksum_start_offset(skb);
1792         BUG_ON(offset >= skb_headlen(skb));
1793         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1794
1795         offset += skb->csum_offset;
1796         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1797
1798         if (skb_cloned(skb) &&
1799             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1800                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1801                 if (ret)
1802                         goto out;
1803         }
1804
1805         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1806 out_set_summed:
1807         skb->ip_summed = CHECKSUM_NONE;
1808 out:
1809         return ret;
1810 }
1811 EXPORT_SYMBOL(skb_checksum_help);
1812
1813 /**
1814  *      skb_gso_segment - Perform segmentation on skb.
1815  *      @skb: buffer to segment
1816  *      @features: features for the output path (see dev->features)
1817  *
1818  *      This function segments the given skb and returns a list of segments.
1819  *
1820  *      It may return NULL if the skb requires no segmentation.  This is
1821  *      only possible when GSO is used for verifying header integrity.
1822  */
1823 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1824 {
1825         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1826         struct packet_type *ptype;
1827         __be16 type = skb->protocol;
1828         int vlan_depth = ETH_HLEN;
1829         int err;
1830
1831         while (type == htons(ETH_P_8021Q)) {
1832                 struct vlan_hdr *vh;
1833
1834                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1835                         return ERR_PTR(-EINVAL);
1836
1837                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1838                 type = vh->h_vlan_encapsulated_proto;
1839                 vlan_depth += VLAN_HLEN;
1840         }
1841
1842         skb_reset_mac_header(skb);
1843         skb->mac_len = skb->network_header - skb->mac_header;
1844         __skb_pull(skb, skb->mac_len);
1845
1846         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1847                 struct net_device *dev = skb->dev;
1848                 struct ethtool_drvinfo info = {};
1849
1850                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1851                         dev->ethtool_ops->get_drvinfo(dev, &info);
1852
1853                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1854                      info.driver, dev ? dev->features : 0L,
1855                      skb->sk ? skb->sk->sk_route_caps : 0L,
1856                      skb->len, skb->data_len, skb->ip_summed);
1857
1858                 if (skb_header_cloned(skb) &&
1859                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1860                         return ERR_PTR(err);
1861         }
1862
1863         rcu_read_lock();
1864         list_for_each_entry_rcu(ptype,
1865                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1866                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1867                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1868                                 err = ptype->gso_send_check(skb);
1869                                 segs = ERR_PTR(err);
1870                                 if (err || skb_gso_ok(skb, features))
1871                                         break;
1872                                 __skb_push(skb, (skb->data -
1873                                                  skb_network_header(skb)));
1874                         }
1875                         segs = ptype->gso_segment(skb, features);
1876                         break;
1877                 }
1878         }
1879         rcu_read_unlock();
1880
1881         __skb_push(skb, skb->data - skb_mac_header(skb));
1882
1883         return segs;
1884 }
1885 EXPORT_SYMBOL(skb_gso_segment);
1886
/*
 * Report a hardware receive checksum error: log a rate-limited message
 * naming the device (or "<unknown>" when @dev is NULL) and dump the
 * stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        const char *name;

        if (!net_ratelimit())
                return;

        name = dev ? dev->name : "<unknown>";
        printk(KERN_ERR "%s: hw csum failure.\n", name);
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1899
/*
 * Return 1 if @skb has a page fragment that @dev cannot DMA from,
 * 0 otherwise.  Without CONFIG_HIGHMEM the answer is always 0.
 *
 * Actually, we should eliminate this check as soon as we know that:
 * 1. an IOMMU is present and allows mapping all of memory, and
 * 2. no high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        struct device *parent;
        int n;

        /* Device cannot DMA from highmem: reject any highmem fragment. */
        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
                        if (PageHighMem(skb_shinfo(skb)->frags[n].page))
                                return 1;
                }
        }

        if (!PCI_DMA_BUS_IS_PHYS)
                return 0;

        parent = dev->dev.parent;
        if (!parent)
                return 0;

        /* Every fragment page must lie entirely below the DMA mask. */
        for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
                dma_addr_t phys = page_to_phys(skb_shinfo(skb)->frags[n].page);

                if (!parent->dma_mask ||
                    phys + PAGE_SIZE - 1 > *parent->dma_mask)
                        return 1;
        }
#endif
        return 0;
}
1929
1930 struct dev_gso_cb {
1931         void (*destructor)(struct sk_buff *skb);
1932 };
1933
1934 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1935
1936 static void dev_gso_skb_destructor(struct sk_buff *skb)
1937 {
1938         struct dev_gso_cb *cb;
1939
1940         do {
1941                 struct sk_buff *nskb = skb->next;
1942
1943                 skb->next = nskb->next;
1944                 nskb->next = NULL;
1945                 kfree_skb(nskb);
1946         } while (skb->next);
1947
1948         cb = DEV_GSO_CB(skb);
1949         if (cb->destructor)
1950                 cb->destructor(skb);
1951 }
1952
1953 /**
1954  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1955  *      @skb: buffer to segment
1956  *      @features: device features as applicable to this skb
1957  *
1958  *      This function segments the given skb and stores the list of segments
1959  *      in skb->next.
1960  */
1961 static int dev_gso_segment(struct sk_buff *skb, int features)
1962 {
1963         struct sk_buff *segs;
1964
1965         segs = skb_gso_segment(skb, features);
1966
1967         /* Verifying header integrity only. */
1968         if (!segs)
1969                 return 0;
1970
1971         if (IS_ERR(segs))
1972                 return PTR_ERR(segs);
1973
1974         skb->next = segs;
1975         DEV_GSO_CB(skb)->destructor = skb->destructor;
1976         skb->destructor = dev_gso_skb_destructor;
1977
1978         return 0;
1979 }
1980
1981 /*
1982  * Try to orphan skb early, right before transmission by the device.
1983  * We cannot orphan skb if tx timestamp is requested or the sk-reference
1984  * is needed on driver level for other reasons, e.g. see net/can/raw.c
1985  */
1986 static inline void skb_orphan_try(struct sk_buff *skb)
1987 {
1988         struct sock *sk = skb->sk;
1989
1990         if (sk && !skb_shinfo(skb)->tx_flags) {
1991                 /* skb_tx_hash() wont be able to get sk.
1992                  * We copy sk_hash into skb->rxhash
1993                  */
1994                 if (!skb->rxhash)
1995                         skb->rxhash = sk->sk_hash;
1996                 skb_orphan(skb);
1997         }
1998 }
1999
2000 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2001 {
2002         return ((features & NETIF_F_GEN_CSUM) ||
2003                 ((features & NETIF_F_V4_CSUM) &&
2004                  protocol == htons(ETH_P_IP)) ||
2005                 ((features & NETIF_F_V6_CSUM) &&
2006                  protocol == htons(ETH_P_IPV6)) ||
2007                 ((features & NETIF_F_FCOE_CRC) &&
2008                  protocol == htons(ETH_P_FCOE)));
2009 }
2010
2011 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2012 {
2013         if (!can_checksum_protocol(features, protocol)) {
2014                 features &= ~NETIF_F_ALL_CSUM;
2015                 features &= ~NETIF_F_SG;
2016         } else if (illegal_highdma(skb->dev, skb)) {
2017                 features &= ~NETIF_F_SG;
2018         }
2019
2020         return features;
2021 }
2022
2023 u32 netif_skb_features(struct sk_buff *skb)
2024 {
2025         __be16 protocol = skb->protocol;
2026         u32 features = skb->dev->features;
2027
2028         if (protocol == htons(ETH_P_8021Q)) {
2029                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2030                 protocol = veh->h_vlan_encapsulated_proto;
2031         } else if (!vlan_tx_tag_present(skb)) {
2032                 return harmonize_features(skb, protocol, features);
2033         }
2034
2035         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2036
2037         if (protocol != htons(ETH_P_8021Q)) {
2038                 return harmonize_features(skb, protocol, features);
2039         } else {
2040                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2041                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2042                 return harmonize_features(skb, protocol, features);
2043         }
2044 }
2045 EXPORT_SYMBOL(netif_skb_features);
2046
2047 /*
2048  * Returns true if either:
2049  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2050  *      2. skb is fragmented and the device does not support SG, or if
2051  *         at least one of fragments is in highmem and device does not
2052  *         support DMA from it.
2053  */
2054 static inline int skb_needs_linearize(struct sk_buff *skb,
2055                                       int features)
2056 {
2057         return skb_is_nonlinear(skb) &&
2058                         ((skb_has_frag_list(skb) &&
2059                                 !(features & NETIF_F_FRAGLIST)) ||
2060                         (skb_shinfo(skb)->nr_frags &&
2061                                 !(features & NETIF_F_SG)));
2062 }
2063
2064 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2065                         struct netdev_queue *txq)
2066 {
2067         const struct net_device_ops *ops = dev->netdev_ops;
2068         int rc = NETDEV_TX_OK;
2069
2070         if (likely(!skb->next)) {
2071                 u32 features;
2072
2073                 /*
2074                  * If device doesnt need skb->dst, release it right now while
2075                  * its hot in this cpu cache
2076                  */
2077                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2078                         skb_dst_drop(skb);
2079
2080                 if (!list_empty(&ptype_all))
2081                         dev_queue_xmit_nit(skb, dev);
2082
2083                 skb_orphan_try(skb);
2084
2085                 features = netif_skb_features(skb);
2086
2087                 if (vlan_tx_tag_present(skb) &&
2088                     !(features & NETIF_F_HW_VLAN_TX)) {
2089                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2090                         if (unlikely(!skb))
2091                                 goto out;
2092
2093                         skb->vlan_tci = 0;
2094                 }
2095
2096                 if (netif_needs_gso(skb, features)) {
2097                         if (unlikely(dev_gso_segment(skb, features)))
2098                                 goto out_kfree_skb;
2099                         if (skb->next)
2100                                 goto gso;
2101                 } else {
2102                         if (skb_needs_linearize(skb, features) &&
2103                             __skb_linearize(skb))
2104                                 goto out_kfree_skb;
2105
2106                         /* If packet is not checksummed and device does not
2107                          * support checksumming for this protocol, complete
2108                          * checksumming here.
2109                          */
2110                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2111                                 skb_set_transport_header(skb,
2112                                         skb_checksum_start_offset(skb));
2113                                 if (!(features & NETIF_F_ALL_CSUM) &&
2114                                      skb_checksum_help(skb))
2115                                         goto out_kfree_skb;
2116                         }
2117                 }
2118
2119                 rc = ops->ndo_start_xmit(skb, dev);
2120                 trace_net_dev_xmit(skb, rc);
2121                 if (rc == NETDEV_TX_OK)
2122                         txq_trans_update(txq);
2123                 return rc;
2124         }
2125
2126 gso:
2127         do {
2128                 struct sk_buff *nskb = skb->next;
2129
2130                 skb->next = nskb->next;
2131                 nskb->next = NULL;
2132
2133                 /*
2134                  * If device doesnt need nskb->dst, release it right now while
2135                  * its hot in this cpu cache
2136                  */
2137                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2138                         skb_dst_drop(nskb);
2139
2140                 rc = ops->ndo_start_xmit(nskb, dev);
2141                 trace_net_dev_xmit(nskb, rc);
2142                 if (unlikely(rc != NETDEV_TX_OK)) {
2143                         if (rc & ~NETDEV_TX_MASK)
2144                                 goto out_kfree_gso_skb;
2145                         nskb->next = skb->next;
2146                         skb->next = nskb;
2147                         return rc;
2148                 }
2149                 txq_trans_update(txq);
2150                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2151                         return NETDEV_TX_BUSY;
2152         } while (skb->next);
2153
2154 out_kfree_gso_skb:
2155         if (likely(skb->next == NULL))
2156                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2157 out_kfree_skb:
2158         kfree_skb(skb);
2159 out:
2160         return rc;
2161 }
2162
2163 static u32 hashrnd __read_mostly;
2164
2165 /*
2166  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2167  * to be used as a distribution range.
2168  */
2169 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2170                   unsigned int num_tx_queues)
2171 {
2172         u32 hash;
2173         u16 qoffset = 0;
2174         u16 qcount = num_tx_queues;
2175
2176         if (skb_rx_queue_recorded(skb)) {
2177                 hash = skb_get_rx_queue(skb);
2178                 while (unlikely(hash >= num_tx_queues))
2179                         hash -= num_tx_queues;
2180                 return hash;
2181         }
2182
2183         if (dev->num_tc) {
2184                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2185                 qoffset = dev->tc_to_txq[tc].offset;
2186                 qcount = dev->tc_to_txq[tc].count;
2187         }
2188
2189         if (skb->sk && skb->sk->sk_hash)
2190                 hash = skb->sk->sk_hash;
2191         else
2192                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2193         hash = jhash_1word(hash, hashrnd);
2194
2195         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2196 }
2197 EXPORT_SYMBOL(__skb_tx_hash);
2198
2199 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2200 {
2201         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2202                 if (net_ratelimit()) {
2203                         pr_warning("%s selects TX queue %d, but "
2204                                 "real number of TX queues is %d\n",
2205                                 dev->name, queue_index, dev->real_num_tx_queues);
2206                 }
2207                 return 0;
2208         }
2209         return queue_index;
2210 }
2211
/*
 * Look up a TX queue for @skb in the XPS map of the current CPU.
 *
 * Returns the queue index, or -1 when the device has no XPS maps, the
 * current CPU has no map, the chosen queue is beyond
 * dev->real_num_tx_queues, or CONFIG_XPS is disabled.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *maps;
        struct xps_map *cpu_map;
        int txq = -1;

        rcu_read_lock();

        maps = rcu_dereference(dev->xps_maps);
        if (!maps)
                goto unlock;

        cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
        if (!cpu_map)
                goto unlock;

        if (cpu_map->len == 1) {
                txq = cpu_map->queues[0];
        } else {
                u32 key;

                /* Several candidates: spread flows over them by hash. */
                if (!skb->sk || !skb->sk->sk_hash)
                        key = (__force u16) skb->protocol ^
                            skb->rxhash;
                else
                        key = skb->sk->sk_hash;
                key = jhash_1word(key, hashrnd);
                txq = cpu_map->queues[((u64)key * cpu_map->len) >> 32];
        }

        if (unlikely(txq >= dev->real_num_tx_queues))
                txq = -1;

unlock:
        rcu_read_unlock();

        return txq;
#else
        return -1;
#endif
}
2249
2250 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2251                                         struct sk_buff *skb)
2252 {
2253         int queue_index;
2254         const struct net_device_ops *ops = dev->netdev_ops;
2255
2256         if (dev->real_num_tx_queues == 1)
2257                 queue_index = 0;
2258         else if (ops->ndo_select_queue) {
2259                 queue_index = ops->ndo_select_queue(dev, skb);
2260                 queue_index = dev_cap_txqueue(dev, queue_index);
2261         } else {
2262                 struct sock *sk = skb->sk;
2263                 queue_index = sk_tx_queue_get(sk);
2264
2265                 if (queue_index < 0 || skb->ooo_okay ||
2266                     queue_index >= dev->real_num_tx_queues) {
2267                         int old_index = queue_index;
2268
2269                         queue_index = get_xps_queue(dev, skb);
2270                         if (queue_index < 0)
2271                                 queue_index = skb_tx_hash(dev, skb);
2272
2273                         if (queue_index != old_index && sk) {
2274                                 struct dst_entry *dst =
2275                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2276
2277                                 if (dst && skb_dst(skb) == dst)
2278                                         sk_tx_queue_set(sk, queue_index);
2279                         }
2280                 }
2281         }
2282
2283         skb_set_queue_mapping(skb, queue_index);
2284         return netdev_get_tx_queue(dev, queue_index);
2285 }
2286
2287 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2288                                  struct net_device *dev,
2289                                  struct netdev_queue *txq)
2290 {
2291         spinlock_t *root_lock = qdisc_lock(q);
2292         bool contended;
2293         int rc;
2294
2295         qdisc_skb_cb(skb)->pkt_len = skb->len;
2296         qdisc_calculate_pkt_len(skb, q);
2297         /*
2298          * Heuristic to force contended enqueues to serialize on a
2299          * separate lock before trying to get qdisc main lock.
2300          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2301          * and dequeue packets faster.
2302          */
2303         contended = qdisc_is_running(q);
2304         if (unlikely(contended))
2305                 spin_lock(&q->busylock);
2306
2307         spin_lock(root_lock);
2308         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2309                 kfree_skb(skb);
2310                 rc = NET_XMIT_DROP;
2311         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2312                    qdisc_run_begin(q)) {
2313                 /*
2314                  * This is a work-conserving queue; there are no old skbs
2315                  * waiting to be sent out; and the qdisc is not running -
2316                  * xmit the skb directly.
2317                  */
2318                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2319                         skb_dst_force(skb);
2320
2321                 qdisc_bstats_update(q, skb);
2322
2323                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2324                         if (unlikely(contended)) {
2325                                 spin_unlock(&q->busylock);
2326                                 contended = false;
2327                         }
2328                         __qdisc_run(q);
2329                 } else
2330                         qdisc_run_end(q);
2331
2332                 rc = NET_XMIT_SUCCESS;
2333         } else {
2334                 skb_dst_force(skb);
2335                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2336                 if (qdisc_run_begin(q)) {
2337                         if (unlikely(contended)) {
2338                                 spin_unlock(&q->busylock);
2339                                 contended = false;
2340                         }
2341                         __qdisc_run(q);
2342                 }
2343         }
2344         spin_unlock(root_lock);
2345         if (unlikely(contended))
2346                 spin_unlock(&q->busylock);
2347         return rc;
2348 }
2349
2350 static DEFINE_PER_CPU(int, xmit_recursion);
2351 #define RECURSION_LIMIT 10
2352
2353 /**
2354  *      dev_queue_xmit - transmit a buffer
2355  *      @skb: buffer to transmit
2356  *
2357  *      Queue a buffer for transmission to a network device. The caller must
2358  *      have set the device and priority and built the buffer before calling
2359  *      this function. The function can be called from an interrupt.
2360  *
2361  *      A negative errno code is returned on a failure. A success does not
2362  *      guarantee the frame will be transmitted as it may be dropped due
2363  *      to congestion or traffic shaping.
2364  *
2365  * -----------------------------------------------------------------------------------
2366  *      I notice this method can also return errors from the queue disciplines,
2367  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2368  *      be positive.
2369  *
2370  *      Regardless of the return value, the skb is consumed, so it is currently
2371  *      difficult to retry a send to this method.  (You can bump the ref count
2372  *      before sending to hold a reference for retry if you are careful.)
2373  *
2374  *      When calling this method, interrupts MUST be enabled.  This is because
2375  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2376  *          --BLG
2377  */
2378 int dev_queue_xmit(struct sk_buff *skb)
2379 {
2380         struct net_device *dev = skb->dev;
2381         struct netdev_queue *txq;
2382         struct Qdisc *q;
2383         int rc = -ENOMEM;
2384
2385         /* Disable soft irqs for various locks below. Also
2386          * stops preemption for RCU.
2387          */
2388         rcu_read_lock_bh();
2389
2390         txq = dev_pick_tx(dev, skb);
2391         q = rcu_dereference_bh(txq->qdisc);
2392
2393 #ifdef CONFIG_NET_CLS_ACT
2394         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2395 #endif
2396         trace_net_dev_queue(skb);
2397         if (q->enqueue) {
2398                 rc = __dev_xmit_skb(skb, q, dev, txq);
2399                 goto out;
2400         }
2401
2402         /* The device has no queue. Common case for software devices:
2403            loopback, all the sorts of tunnels...
2404
2405            Really, it is unlikely that netif_tx_lock protection is necessary
2406            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2407            counters.)
2408            However, it is possible, that they rely on protection
2409            made by us here.
2410
2411            Check this and shot the lock. It is not prone from deadlocks.
2412            Either shot noqueue qdisc, it is even simpler 8)
2413          */
2414         if (dev->flags & IFF_UP) {
2415                 int cpu = smp_processor_id(); /* ok because BHs are off */
2416
2417                 if (txq->xmit_lock_owner != cpu) {
2418
2419                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2420                                 goto recursion_alert;
2421
2422                         HARD_TX_LOCK(dev, txq, cpu);
2423
2424                         if (!netif_tx_queue_stopped(txq)) {
2425                                 __this_cpu_inc(xmit_recursion);
2426                                 rc = dev_hard_start_xmit(skb, dev, txq);
2427                                 __this_cpu_dec(xmit_recursion);
2428                                 if (dev_xmit_complete(rc)) {
2429                                         HARD_TX_UNLOCK(dev, txq);
2430                                         goto out;
2431                                 }
2432                         }
2433                         HARD_TX_UNLOCK(dev, txq);
2434                         if (net_ratelimit())
2435                                 printk(KERN_CRIT "Virtual device %s asks to "
2436                                        "queue packet!\n", dev->name);
2437                 } else {
2438                         /* Recursion is detected! It is possible,
2439                          * unfortunately
2440                          */
2441 recursion_alert:
2442                         if (net_ratelimit())
2443                                 printk(KERN_CRIT "Dead loop on virtual device "
2444                                        "%s, fix it urgently!\n", dev->name);
2445                 }
2446         }
2447
2448         rc = -ENETDOWN;
2449         rcu_read_unlock_bh();
2450
2451         kfree_skb(skb);
2452         return rc;
2453 out:
2454         rcu_read_unlock_bh();
2455         return rc;
2456 }
2457 EXPORT_SYMBOL(dev_queue_xmit);
2458
2459
2460 /*=======================================================================
2461                         Receiver routines
2462   =======================================================================*/
2463
/*
 * Receive-path tunables.  Their consumers are outside this part of the
 * file; NOTE(review): presumably exposed as sysctls - confirm.
 */
int netdev_max_backlog __read_mostly = 1000;
int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */

/*
 * Queue @napi at the tail of @sd's poll list and raise the RX softirq.
 * Called with irq disabled.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
        list_add_tail(&napi->poll_list, &sd->poll_list);
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2476
2477 /*
2478  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2479  * and src/dst port numbers. Returns a non-zero hash number on success
2480  * and 0 on failure.
2481  */
2482 __u32 __skb_get_rxhash(struct sk_buff *skb)
2483 {
2484         int nhoff, hash = 0, poff;
2485         struct ipv6hdr *ip6;
2486         struct iphdr *ip;
2487         u8 ip_proto;
2488         u32 addr1, addr2, ihl;
2489         union {
2490                 u32 v32;
2491                 u16 v16[2];
2492         } ports;
2493
2494         nhoff = skb_network_offset(skb);
2495
2496         switch (skb->protocol) {
2497         case __constant_htons(ETH_P_IP):
2498                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2499                         goto done;
2500
2501                 ip = (struct iphdr *) (skb->data + nhoff);
2502                 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2503                         ip_proto = 0;
2504                 else
2505                         ip_proto = ip->protocol;
2506                 addr1 = (__force u32) ip->saddr;
2507                 addr2 = (__force u32) ip->daddr;
2508                 ihl = ip->ihl;
2509                 break;
2510         case __constant_htons(ETH_P_IPV6):
2511                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2512                         goto done;
2513
2514                 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2515                 ip_proto = ip6->nexthdr;
2516                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2517                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2518                 ihl = (40 >> 2);
2519                 break;
2520         default:
2521                 goto done;
2522         }
2523
2524         ports.v32 = 0;
2525         poff = proto_ports_offset(ip_proto);
2526         if (poff >= 0) {
2527                 nhoff += ihl * 4 + poff;
2528                 if (pskb_may_pull(skb, nhoff + 4)) {
2529                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2530                         if (ports.v16[1] < ports.v16[0])
2531                                 swap(ports.v16[0], ports.v16[1]);
2532                 }
2533         }
2534
2535         /* get a consistent hash (same value on both flow directions) */
2536         if (addr2 < addr1)
2537                 swap(addr1, addr2);
2538
2539         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2540         if (!hash)
2541                 hash = 1;
2542
2543 done:
2544         return hash;
2545 }
2546 EXPORT_SYMBOL(__skb_get_rxhash);
2547
2548 #ifdef CONFIG_RPS
2549
2550 /*
      * One global table that all flow-based protocols share.
      * The pointer is RCU-annotated (__rcu); get_rps_cpu() reads it with
      * rcu_dereference().
      */
2551 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2552 EXPORT_SYMBOL(rps_sock_flow_table);
2553
/*
 * set_rps_cpu - point a device flow entry at a new CPU.
 * @dev: device the packet was received on
 * @skb: packet that triggered the change
 * @rflow: flow entry of the receiving queue for this packet's hash
 * @next_cpu: CPU to steer the flow to, or RPS_NO_CPU
 *
 * Stores @next_cpu in @rflow.  When CONFIG_RFS_ACCEL is enabled and the
 * device can steer in hardware, the driver is also asked to move the flow
 * to the RX queue mapped to @next_cpu; on success the flow entry of that
 * queue is used from then on.  Unless @next_cpu is RPS_NO_CPU, last_qtail
 * of the entry in use is set to the target CPU's input_queue_head.
 *
 * Returns the flow entry now in use (either @rflow or the entry in the
 * new queue's flow table).
 */
2554 static struct rps_dev_flow *
2555 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2556             struct rps_dev_flow *rflow, u16 next_cpu)
2557 {
2558         u16 tcpu;
2559
2560         tcpu = rflow->cpu = next_cpu;
2561         if (tcpu != RPS_NO_CPU) {
2562 #ifdef CONFIG_RFS_ACCEL
2563                 struct netdev_rx_queue *rxqueue;
2564                 struct rps_dev_flow_table *flow_table;
2565                 struct rps_dev_flow *old_rflow;
2566                 u32 flow_id;
2567                 u16 rxq_index;
2568                 int rc;
2569
2570                 /* Should we steer this flow to a different hardware queue? */
2571                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2572                     !(dev->features & NETIF_F_NTUPLE))
2573                         goto out;
2574                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                     /* Already arriving on the queue mapped to next_cpu. */
2575                 if (rxq_index == skb_get_rx_queue(skb))
2576                         goto out;
2577
2578                 rxqueue = dev->_rx + rxq_index;
2579                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2580                 if (!flow_table)
2581                         goto out;
2582                 flow_id = skb->rxhash & flow_table->mask;
2583                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2584                                                         rxq_index, flow_id);
                     /* A negative return means no filter was installed. */
2585                 if (rc < 0)
2586                         goto out;
                     /* Switch to the flow entry in the new queue's table. */
2587                 old_rflow = rflow;
2588                 rflow = &flow_table->flows[flow_id];
2589                 rflow->cpu = next_cpu;
2590                 rflow->filter = rc;
                     /* The old entry must no longer claim the same filter id. */
2591                 if (old_rflow->filter == rflow->filter)
2592                         old_rflow->filter = RPS_NO_FILTER;
2593         out:
2594 #endif
                     /* Snapshot the target CPU's queue head for this entry. */
2595                 rflow->last_qtail =
2596                         per_cpu(softnet_data, tcpu).input_queue_head;
2597         }
2598
2599         return rflow;
2600 }
2601
2602 /*
2603  * get_rps_cpu is called from netif_receive_skb and returns the target
2604  * CPU from the RPS map of the receiving queue for a given skb.
2605  * rcu_read_lock must be held on entry.
2606  */
2607 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2608                        struct rps_dev_flow **rflowp)
2609 {
2610         struct netdev_rx_queue *rxqueue;
2611         struct rps_map *map;
2612         struct rps_dev_flow_table *flow_table;
2613         struct rps_sock_flow_table *sock_flow_table;
2614         int cpu = -1;
2615         u16 tcpu;
2616
2617         if (skb_rx_queue_recorded(skb)) {
2618                 u16 index = skb_get_rx_queue(skb);
2619                 if (unlikely(index >= dev->real_num_rx_queues)) {
2620                         WARN_ONCE(dev->real_num_rx_queues > 1,
2621                                   "%s received packet on queue %u, but number "
2622                                   "of RX queues is %u\n",
2623                                   dev->name, index, dev->real_num_rx_queues);
2624                         goto done;
2625                 }
2626                 rxqueue = dev->_rx + index;
2627         } else
2628                 rxqueue = dev->_rx;
2629
2630         map = rcu_dereference(rxqueue->rps_map);
2631         if (map) {
2632                 if (map->len == 1 &&
2633                     !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2634                         tcpu = map->cpus[0];
2635                         if (cpu_online(tcpu))
2636                                 cpu = tcpu;
2637                         goto done;
2638                 }
2639         } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2640                 goto done;
2641         }
2642
2643         skb_reset_network_header(skb);
2644         if (!skb_get_rxhash(skb))
2645                 goto done;
2646
2647         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2648         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2649         if (flow_table && sock_flow_table) {
2650                 u16 next_cpu;
2651                 struct rps_dev_flow *rflow;
2652
2653                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2654                 tcpu = rflow->cpu;
2655
2656                 next_cpu = sock_flow_table->ents[skb->rxhash &
2657                     sock_flow_table->mask];
2658
2659                 /*
2660                  * If the desired CPU (where last recvmsg was done) is
2661                  * different from current CPU (one in the rx-queue flow
2662                  * table entry), switch if one of the following holds:
2663                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2664                  *   - Current CPU is offline.
2665                  *   - The current CPU's queue tail has advanced beyond the
2666                  *     last packet that was enqueued using this table entry.
2667                  *     This guarantees that all previous packets for the flow
2668                  *     have been dequeued, thus preserving in order delivery.
2669                  */
2670                 if (unlikely(tcpu != next_cpu) &&
2671                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2672                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2673                       rflow->last_qtail)) >= 0))
2674                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2675
2676                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2677                         *rflowp = rflow;
2678                         cpu = tcpu;
2679                         goto done;
2680                 }
2681         }
2682
2683         if (map) {
2684                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2685
2686                 if (cpu_online(tcpu)) {
2687                         cpu = tcpu;
2688                         goto done;
2689                 }
2690         }
2691
2692 done:
2693         return cpu;
2694 }
2695
2696 #ifdef CONFIG_RFS_ACCEL
2697
2698 /**
2699  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2700  * @dev: Device on which the filter was set
2701  * @rxq_index: RX queue index
2702  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2703  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2704  *
2705  * Drivers that implement ndo_rx_flow_steer() should periodically call
2706  * this function for each installed filter and remove the filters for
2707  * which it returns %true.
2708  */
2709 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2710                          u32 flow_id, u16 filter_id)
2711 {
2712         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2713         struct rps_dev_flow_table *flow_table;
2714         struct rps_dev_flow *rflow;
2715         bool expire = true;
2716         int cpu;
2717
2718         rcu_read_lock();
2719         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2720         if (flow_table && flow_id <= flow_table->mask) {
2721                 rflow = &flow_table->flows[flow_id];
2722                 cpu = ACCESS_ONCE(rflow->cpu);
2723                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2724                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2725                            rflow->last_qtail) <
2726                      (int)(10 * flow_table->mask)))
2727                         expire = false;
2728         }
2729         rcu_read_unlock();
2730         return expire;
2731 }
2732 EXPORT_SYMBOL(rps_may_expire_flow);
2733
2734 #endif /* CONFIG_RFS_ACCEL */
2735
2736 /*
      * Called from hardirq (IPI) context.
      * @data is the softnet_data whose backlog NAPI gets scheduled; its
      * received_rps counter is incremented on every call.
      */
2737 static void rps_trigger_softirq(void *data)
2738 {
2739         struct softnet_data *sd = data;
2740
2741         ____napi_schedule(sd, &sd->backlog);
2742         sd->received_rps++;
2743 }
2744
2745 #endif /* CONFIG_RPS */
2746
/*
 * Decide whether @sd belongs to another CPU.
 * If it does, chain it onto this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ
 * and return 1.
 * If it is our own softnet_data (or RPS is compiled out), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	/* push the remote softnet_data on the head of our IPI list */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2767
2768 /*
2769  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2770  * queue (may be a remote CPU queue).
2771  */
2772 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2773                               unsigned int *qtail)
2774 {
2775         struct softnet_data *sd;
2776         unsigned long flags;
2777
2778         sd = &per_cpu(softnet_data, cpu);
2779
2780         local_irq_save(flags);
2781
2782         rps_lock(sd);
2783         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2784                 if (skb_queue_len(&sd->input_pkt_queue)) {
2785 enqueue:
2786                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2787                         input_queue_tail_incr_save(sd, qtail);
2788                         rps_unlock(sd);
2789                         local_irq_restore(flags);
2790                         return NET_RX_SUCCESS;
2791                 }
2792
2793                 /* Schedule NAPI for backlog device
2794                  * We can use non atomic operation since we own the queue lock
2795                  */
2796                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2797                         if (!rps_ipi_queued(sd))
2798                                 ____napi_schedule(sd, &sd->backlog);
2799                 }
2800                 goto enqueue;
2801         }
2802
2803         sd->dropped++;
2804         rps_unlock(sd);
2805
2806         local_irq_restore(flags);
2807
2808         atomic_long_inc(&skb->dev->rx_dropped);
2809         kfree_skb(skb);
2810         return NET_RX_DROP;
2811 }
2812
2813 /**
2814  *      netif_rx        -       post buffer to the network code
2815  *      @skb: buffer to post
2816  *
2817  *      This function receives a packet from a device driver and queues it for
2818  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2819  *      may be dropped during processing for congestion control or by the
2820  *      protocol layers.
2821  *
2822  *      return values:
2823  *      NET_RX_SUCCESS  (no congestion)
2824  *      NET_RX_DROP     (packet was dropped)
2825  *
2826  */
2827
2828 int netif_rx(struct sk_buff *skb)
2829 {
2830         int ret;
2831
2832         /* if netpoll wants it, pretend we never saw it */
2833         if (netpoll_rx(skb))
2834                 return NET_RX_DROP;
2835
2836         if (netdev_tstamp_prequeue)
2837                 net_timestamp_check(skb);
2838
2839         trace_netif_rx(skb);
2840 #ifdef CONFIG_RPS
2841         {
2842                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2843                 int cpu;
2844
2845                 preempt_disable();
2846                 rcu_read_lock();
2847
2848                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2849                 if (cpu < 0)
2850                         cpu = smp_processor_id();
2851
2852                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2853
2854                 rcu_read_unlock();
2855                 preempt_enable();
2856         }
2857 #else
2858         {
2859                 unsigned int qtail;
2860                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2861                 put_cpu();
2862         }
2863 #endif
2864         return ret;
2865 }
2866 EXPORT_SYMBOL(netif_rx);
2867
/*
 * netif_rx_ni - netif_rx() followed by a softirq run.
 * @skb: buffer to post
 *
 * Calls netif_rx() with preemption disabled and then runs do_softirq()
 * if any softirq is pending on this CPU.
 * Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2881
/*
 * net_tx_action - transmit-side softirq work for the current CPU.
 * @h: softirq action descriptor (not used in the body)
 *
 * Frees every skb parked on this CPU's completion_queue, then walks the
 * qdiscs chained on output_queue and runs each one whose root lock can be
 * taken without spinning.
 * NOTE(review): presumably registered as the NET_TX_SOFTIRQ handler; the
 * registration is not visible in this chunk.
 */
2882 static void net_tx_action(struct softirq_action *h)
2883 {
2884         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2885
2886         if (sd->completion_queue) {
2887                 struct sk_buff *clist;
2888
                     /* Detach the whole list with irqs off, free it with irqs on. */
2889                 local_irq_disable();
2890                 clist = sd->completion_queue;
2891                 sd->completion_queue = NULL;
2892                 local_irq_enable();
2893
2894                 while (clist) {
2895                         struct sk_buff *skb = clist;
2896                         clist = clist->next;
2897
                             /* Warn if the skb still has users when we free it. */
2898                         WARN_ON(atomic_read(&skb->users));
2899                         trace_kfree_skb(skb, net_tx_action);
2900                         __kfree_skb(skb);
2901                 }
2902         }
2903
2904         if (sd->output_queue) {
2905                 struct Qdisc *head;
2906
                     /* Detach the qdisc list and reset the tail pointer to empty. */
2907                 local_irq_disable();
2908                 head = sd->output_queue;
2909                 sd->output_queue = NULL;
2910                 sd->output_queue_tailp = &sd->output_queue;
2911                 local_irq_enable();
2912
2913                 while (head) {
2914                         struct Qdisc *q = head;
2915                         spinlock_t *root_lock;
2916
2917                         head = head->next_sched;
2918
2919                         root_lock = qdisc_lock(q);
2920                         if (spin_trylock(root_lock)) {
                                     /* Clear SCHED before running the qdisc. */
2921                                 smp_mb__before_clear_bit();
2922                                 clear_bit(__QDISC_STATE_SCHED,
2923                                           &q->state);
2924                                 qdisc_run(q);
2925                                 spin_unlock(root_lock);
2926                         } else {
                                     /*
                                      * Lock is busy: reschedule the qdisc,
                                      * unless it has been deactivated, in
                                      * which case only drop the SCHED bit.
                                      */
2927                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2928                                               &q->state)) {
2929                                         __netif_reschedule(q);
2930                                 } else {
2931                                         smp_mb__before_clear_bit();
2932                                         clear_bit(__QDISC_STATE_SCHED,
2933                                                   &q->state);
2934                                 }
2935                         }
2936                 }
2937         }
2938 }
2939
2940 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2941     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2942 /*
      * This hook is defined here for ATM LANE.
      * It is only built when both the bridge and ATM LANE are configured
      * (built in or as modules).  Nothing in this part of the file sets or
      * calls it.
      */
2943 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2944                              unsigned char *addr) __read_mostly;
2945 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2946 #endif
2947
2948 #ifdef CONFIG_NET_CLS_ACT
2949 /* TODO: Maybe we should just force sch_ingress to be compiled in
2950  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2951  * a compare and 2 stores extra right now if we dont have it on
2952  * but have CONFIG_NET_CLS_ACT
2953  * NOTE: This doesnt stop any functionality; if you dont have
2954  * the ingress scheduler, you just cant add policies on ingress.
2955  *
2956  */
2957 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2958 {
2959         struct net_device *dev = skb->dev;
2960         u32 ttl = G_TC_RTTL(skb->tc_verd);
2961         int result = TC_ACT_OK;
2962         struct Qdisc *q;
2963
2964         if (unlikely(MAX_RED_LOOP < ttl++)) {
2965                 if (net_ratelimit())
2966                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2967                                skb->skb_iif, dev->ifindex);
2968                 return TC_ACT_SHOT;
2969         }
2970
2971         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2972         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2973
2974         q = rxq->qdisc;
2975         if (q != &noop_qdisc) {
2976                 spin_lock(qdisc_lock(q));
2977                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2978                         result = qdisc_enqueue_root(skb, q);
2979                 spin_unlock(qdisc_lock(q));
2980         }
2981
2982         return result;
2983 }
2984
2985 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2986                                          struct packet_type **pt_prev,
2987                                          int *ret, struct net_device *orig_dev)
2988 {
2989         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2990
2991         if (!rxq || rxq->qdisc == &noop_qdisc)
2992                 goto out;
2993
2994         if (*pt_prev) {
2995                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2996                 *pt_prev = NULL;
2997         }
2998
2999         switch (ing_filter(skb, rxq)) {
3000         case TC_ACT_SHOT:
3001         case TC_ACT_STOLEN:
3002                 kfree_skb(skb);
3003                 return NULL;
3004         }
3005
3006 out:
3007         skb->tc_verd = 0;
3008         return skb;
3009 }
3010 #endif
3011
3012 /**
3013  *      netdev_rx_handler_register - register receive handler
3014  *      @dev: device to register a handler for
3015  *      @rx_handler: receive handler to register
3016  *      @rx_handler_data: data pointer that is used by rx handler
3017  *
3018  *      Register a receive handler for a device. This handler will then be
3019  *      called from __netif_receive_skb. A negative errno code is returned
3020  *      on a failure; -EBUSY if the device already has a handler.
3021  *
3022  *      The caller must hold the rtnl_mutex.
3023  *
3024  *      For a general description of rx_handler, see enum rx_handler_result.
3025  */
3026 int netdev_rx_handler_register(struct net_device *dev,
3027                                rx_handler_func_t *rx_handler,
3028                                void *rx_handler_data)
3029 {
3030         ASSERT_RTNL();
3031
3032         if (dev->rx_handler)
3033                 return -EBUSY;
3034
             /* Publish the data before the handler that will consume it. */
3035         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3036         rcu_assign_pointer(dev->rx_handler, rx_handler);
3037
3038         return 0;
3039 }
3040 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3041
3042 /**
3043  *      netdev_rx_handler_unregister - unregister receive handler
3044  *      @dev: device to unregister a handler from
3045  *
3046  *      Unregister a receive hander from a device.
3047  *
3048  *      The caller must hold the rtnl_mutex.
3049  */
3050 void netdev_rx_handler_unregister(struct net_device *dev)
3051 {
3052
3053         ASSERT_RTNL();
3054         rcu_assign_pointer(dev->rx_handler, NULL);
3055         rcu_assign_pointer(dev->rx_handler_data, NULL);
3056 }
3057 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3058
3059 static void vlan_on_bond_hook(struct sk_buff *skb)
3060 {
3061         /*
3062          * Make sure ARP frames received on VLAN interfaces stacked on
3063          * bonding interfaces still make their way to any base bonding
3064          * device that may have registered for a specific ptype.
3065          */
3066         if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
3067             vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
3068             skb->protocol == htons(ETH_P_ARP)) {
3069                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
3070
3071                 if (!skb2)
3072                         return;
3073                 skb2->dev = vlan_dev_real_dev(skb->dev);
3074                 netif_rx(skb2);
3075         }
3076 }
3077
/*
 * __netif_receive_skb - deliver one received packet to its handlers.
 * @skb: packet to process
 *
 * In order, under rcu_read_lock(): delivers to the matching ptype_all
 * entries, runs ingress filtering (CONFIG_NET_CLS_ACT), calls the device's
 * rx_handler if one is registered, handles a hardware-extracted VLAN tag,
 * and finally delivers to the ptype_base handlers for skb->protocol.
 *
 * Delivery runs one handler behind: a matching handler is remembered in
 * pt_prev and handed to deliver_skb() only when the next match (or the
 * next stage) is reached.  The last match is called directly through
 * pt_prev->func().
 *
 * Returns the result of the last delivery, or NET_RX_DROP if netpoll took
 * the packet or no handler matched (the skb is freed and the device's
 * rx_dropped counter incremented in the latter case).
 */
3078 static int __netif_receive_skb(struct sk_buff *skb)
3079 {
3080         struct packet_type *ptype, *pt_prev;
3081         rx_handler_func_t *rx_handler;
3082         struct net_device *orig_dev;
3083         struct net_device *null_or_dev;
3084         bool deliver_exact = false;
3085         int ret = NET_RX_DROP;
3086         __be16 type;
3087
3088         if (!netdev_tstamp_prequeue)
3089                 net_timestamp_check(skb);
3090
3091         trace_netif_receive_skb(skb);
3092
3093         /* if we've gotten here through NAPI, check netpoll */
3094         if (netpoll_receive_skb(skb))
3095                 return NET_RX_DROP;
3096
             /* Record the incoming interface once; keep an already set value. */
3097         if (!skb->skb_iif)
3098                 skb->skb_iif = skb->dev->ifindex;
3099         orig_dev = skb->dev;
3100
3101         skb_reset_network_header(skb);
3102         skb_reset_transport_header(skb);
3103         skb->mac_len = skb->network_header - skb->mac_header;
3104
3105         pt_prev = NULL;
3106
3107         rcu_read_lock();
3108
             /* Re-entered when an rx_handler returns RX_HANDLER_ANOTHER. */
3109 another_round:
3110
3111         __this_cpu_inc(softnet_data.processed);
3112
3113 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and skip the ptype_all and ingress stages. */
3114         if (skb->tc_verd & TC_NCLS) {
3115                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3116                 goto ncls;
3117         }
3118 #endif
3119
             /* ptype_all entries: unbound, or bound to the receiving device. */
3120         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3121                 if (!ptype->dev || ptype->dev == skb->dev) {
3122                         if (pt_prev)
3123                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3124                         pt_prev = ptype;
3125                 }
3126         }
3127
3128 #ifdef CONFIG_NET_CLS_ACT
3129         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3130         if (!skb)
3131                 goto out;
3132 ncls:
3133 #endif
3134
3135         rx_handler = rcu_dereference(skb->dev->rx_handler);
3136         if (rx_handler) {
                     /* Flush the pending delivery before the handler may change skb. */
3137                 if (pt_prev) {
3138                         ret = deliver_skb(skb, pt_prev, orig_dev);
3139                         pt_prev = NULL;
3140                 }
3141                 switch (rx_handler(&skb)) {
3142                 case RX_HANDLER_CONSUMED:
3143                         goto out;
3144                 case RX_HANDLER_ANOTHER:
3145                         goto another_round;
3146                 case RX_HANDLER_EXACT:
3147                         deliver_exact = true;
                             /* fall through */
3148                 case RX_HANDLER_PASS:
3149                         break;
3150                 default:
3151                         BUG();
3152                 }
3153         }
3154
3155         if (vlan_tx_tag_present(skb)) {
3156                 if (pt_prev) {
3157                         ret = deliver_skb(skb, pt_prev, orig_dev);
3158                         pt_prev = NULL;
3159                 }
                     /*
                      * Non-zero return: process the skb again from the top.
                      * Zero return with skb set to NULL: nothing left to do.
                      */
3160                 if (vlan_hwaccel_do_receive(&skb)) {
3161                         ret = __netif_receive_skb(skb);
3162                         goto out;
3163                 } else if (unlikely(!skb))
3164                         goto out;
3165         }
3166
3167         vlan_on_bond_hook(skb);
3168
3169         /* deliver only exact match when indicated */
3170         null_or_dev = deliver_exact ? skb->dev : NULL;
3171
3172         type = skb->protocol;
3173         list_for_each_entry_rcu(ptype,
3174                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3175                 if (ptype->type == type &&
3176                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3177                      ptype->dev == orig_dev)) {
3178                         if (pt_prev)
3179                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3180                         pt_prev = ptype;
3181                 }
3182         }
3183
             /* The last matching handler is called directly, not via deliver_skb(). */
3184         if (pt_prev) {
3185                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3186         } else {
3187                 atomic_long_inc(&skb->dev->rx_dropped);
3188                 kfree_skb(skb);
3189                 /* Jamal, now you will not able to escape explaining
3190                  * me how you were going to use this. :-)
3191                  */
3192                 ret = NET_RX_DROP;
3193         }
3194
3195 out:
3196         rcu_read_unlock();
3197         return ret;
3198 }
3199
3200 /**
3201  *      netif_receive_skb - process receive buffer from network
3202  *      @skb: buffer to process
3203  *
3204  *      netif_receive_skb() is the main receive data processing function.
3205  *      It always succeeds. The buffer may be dropped during processing
3206  *      for congestion control or by the protocol layers.
3207  *
3208  *      This function may only be called from softirq context and interrupts
3209  *      should be enabled.
3210  *
3211  *      Return values (usually ignored):
3212  *      NET_RX_SUCCESS: no congestion
3213  *      NET_RX_DROP: packet was dropped
3214  */
3215 int netif_receive_skb(struct sk_buff *skb)
3216 {
3217         if (netdev_tstamp_prequeue)
3218                 net_timestamp_check(skb);
3219
3220         if (skb_defer_rx_timestamp(skb))
3221                 return NET_RX_SUCCESS;
3222
3223 #ifdef CONFIG_RPS
3224         {
3225                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3226                 int cpu, ret;
3227
3228                 rcu_read_lock();
3229
3230                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3231
3232                 if (cpu >= 0) {
3233                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3234                         rcu_read_unlock();
3235                 } else {
3236                         rcu_read_unlock();
3237                         ret = __netif_receive_skb(skb);
3238                 }
3239
3240                 return ret;
3241         }
3242 #else
3243         return __netif_receive_skb(skb);
3244 #endif
3245 }
3246 EXPORT_SYMBOL(netif_receive_skb);
3247
3248 /* Network device is going away, flush any packets still pending
3249  * Called with irqs disabled.
3250  */
3251 static void flush_backlog(void *arg)
3252 {
3253         struct net_device *dev = arg;
3254         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3255         struct sk_buff *skb, *tmp;
3256
3257         rps_lock(sd);
3258         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3259                 if (skb->dev == dev) {
3260                         __skb_unlink(skb, &sd->input_pkt_queue);
3261                         kfree_skb(skb);
3262                         input_queue_head_incr(sd);
3263                 }
3264         }
3265         rps_unlock(sd);
3266
3267         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3268                 if (skb->dev == dev) {
3269                         __skb_unlink(skb, &sd->process_queue);
3270                         kfree_skb(skb);
3271                         input_queue_head_incr(sd);
3272                 }
3273         }
3274 }
3275
/*
 * napi_gro_complete - finish GRO processing of @skb and pass it up.
 *
 * An skb whose GRO count is 1 only has gso_size cleared.  Otherwise the
 * gro_complete() callback of the first device-independent packet type
 * matching skb->protocol is run.  If that callback fails, or none is
 * found (err stays -ENOENT), the skb is freed and NET_RX_SUCCESS is
 * returned.  In all other cases the result of netif_receive_skb() is
 * returned.
 */
3276 static int napi_gro_complete(struct sk_buff *skb)
3277 {
3278         struct packet_type *ptype;
3279         __be16 type = skb->protocol;
3280         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3281         int err = -ENOENT;
3282
3283         if (NAPI_GRO_CB(skb)->count == 1) {
3284                 skb_shinfo(skb)->gso_size = 0;
3285                 goto out;
3286         }
3287
3288         rcu_read_lock();
3289         list_for_each_entry_rcu(ptype, head, list) {
3290                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3291                         continue;
3292
3293                 err = ptype->gro_complete(skb);
3294                 break;
3295         }
3296         rcu_read_unlock();
3297
3298         if (err) {
                     /* Warn when the loop ran off the end, i.e. no handler matched. */
3299                 WARN_ON(&ptype->list == head);
3300                 kfree_skb(skb);
3301                 return NET_RX_SUCCESS;
3302         }
3303
3304 out:
3305         return netif_receive_skb(skb);
3306 }
3307
3308 inline void napi_gro_flush(struct napi_struct *napi)
3309 {
3310         struct sk_buff *skb, *next;
3311
3312         for (skb = napi->gro_list; skb; skb = next) {
3313                 next = skb->next;
3314                 skb->next = NULL;
3315                 napi_gro_complete(skb);
3316         }
3317
3318         napi->gro_count = 0;
3319         napi->gro_list = NULL;
3320 }
3321 EXPORT_SYMBOL(napi_gro_flush);
3322
/*
 * dev_gro_receive - try to merge @skb into a packet held on @napi's GRO list.
 *
 * Returns:
 *   GRO_NORMAL      - GRO does not apply (feature off, netpoll active, skb
 *                     is GSO or has a frag list, no gro_receive handler,
 *                     flush requested, or the list is full); the caller
 *                     must process the skb normally.
 *   GRO_MERGED      - skb matched a held flow (same_flow set).
 *   GRO_MERGED_FREE - as GRO_MERGED, with NAPI_GRO_CB(skb)->free set.
 *   GRO_HELD        - skb was added to the GRO list as a new flow.
 *
 * For GRO_NORMAL and GRO_HELD the headers parsed so far are copied into
 * the linear area before returning (label "pull").
 */
3323 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3324 {
3325         struct sk_buff **pp = NULL;
3326         struct packet_type *ptype;
3327         __be16 type = skb->protocol;
3328         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3329         int same_flow;
3330         int mac_len;
3331         enum gro_result ret;
3332
3333         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3334                 goto normal;
3335
3336         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3337                 goto normal;
3338
3339         rcu_read_lock();
             /* Use the first device-independent handler for this protocol. */
3340         list_for_each_entry_rcu(ptype, head, list) {
3341                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3342                         continue;
3343
3344                 skb_set_network_header(skb, skb_gro_offset(skb));
3345                 mac_len = skb->network_header - skb->mac_header;
3346                 skb->mac_len = mac_len;
3347                 NAPI_GRO_CB(skb)->same_flow = 0;
3348                 NAPI_GRO_CB(skb)->flush = 0;
3349                 NAPI_GRO_CB(skb)->free = 0;
3350
3351                 pp = ptype->gro_receive(&napi->gro_list, skb);
3352                 break;
3353         }
3354         rcu_read_unlock();
3355
             /* Loop ran to completion: no handler with gro_receive was found. */
3356         if (&ptype->list == head)
3357                 goto normal;
3358
3359         same_flow = NAPI_GRO_CB(skb)->same_flow;
3360         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3361
             /* The handler returned a held packet to complete: unlink and pass it up. */
3362         if (pp) {
3363                 struct sk_buff *nskb = *pp;
3364
3365                 *pp = nskb->next;
3366                 nskb->next = NULL;
3367                 napi_gro_complete(nskb);
3368                 napi->gro_count--;
3369         }
3370
3371         if (same_flow)
3372                 goto ok;
3373
3374         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3375                 goto normal;
3376
             /* New flow: hold this skb at the head of the GRO list. */
3377         napi->gro_count++;
3378         NAPI_GRO_CB(skb)->count = 1;
3379         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3380         skb->next = napi->gro_list;
3381         napi->gro_list = skb;
3382         ret = GRO_HELD;
3383
3384 pull:
             /*
              * If the GRO offset lies beyond the linear data, copy the
              * missing bytes from frag0 into the linear area and shrink
              * the first page fragment accordingly.
              */
3385         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3386                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3387
3388                 BUG_ON(skb->end - skb->tail < grow);
3389
3390                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3391
3392                 skb->tail += grow;
3393                 skb->data_len -= grow;
3394
3395                 skb_shinfo(skb)->frags[0].page_offset += grow;
3396                 skb_shinfo(skb)->frags[0].size -= grow;
3397
                     /* First fragment fully consumed: release it and shift the rest down. */
3398                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3399                         put_page(skb_shinfo(skb)->frags[0].page);
3400                         memmove(skb_shinfo(skb)->frags,
3401                                 skb_shinfo(skb)->frags + 1,
3402                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3403                 }
3404         }
3405
3406 ok:
3407         return ret;
3408
3409 normal:
3410         ret = GRO_NORMAL;
3411         goto pull;
3412 }
3413 EXPORT_SYMBOL(dev_gro_receive);
3414
3415 static inline gro_result_t
3416 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3417 {
3418         struct sk_buff *p;
3419
3420         for (p = napi->gro_list; p; p = p->next) {
3421                 unsigned long diffs;
3422
3423                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3424                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3425                 diffs |= compare_ether_header(skb_mac_header(p),
3426                                               skb_gro_mac_header(skb));
3427                 NAPI_GRO_CB(p)->same_flow = !diffs;
3428                 NAPI_GRO_CB(p)->flush = 0;
3429         }
3430
3431         return dev_gro_receive(napi, skb);
3432 }
3433
3434 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3435 {
3436         switch (ret) {
3437         case GRO_NORMAL:
3438                 if (netif_receive_skb(skb))
3439                         ret = GRO_DROP;
3440                 break;
3441
3442         case GRO_DROP:
3443         case GRO_MERGED_FREE:
3444                 kfree_skb(skb);
3445                 break;
3446
3447         case GRO_HELD:
3448         case GRO_MERGED:
3449                 break;
3450         }
3451
3452         return ret;
3453 }
3454 EXPORT_SYMBOL(napi_skb_finish);
3455
3456 void skb_gro_reset_offset(struct sk_buff *skb)
3457 {
3458         NAPI_GRO_CB(skb)->data_offset = 0;
3459         NAPI_GRO_CB(skb)->frag0 = NULL;
3460         NAPI_GRO_CB(skb)->frag0_len = 0;
3461
3462         if (skb->mac_header == skb->tail &&
3463             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3464                 NAPI_GRO_CB(skb)->frag0 =
3465                         page_address(skb_shinfo(skb)->frags[0].page) +
3466                         skb_shinfo(skb)->frags[0].page_offset;
3467                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3468         }
3469 }
3470 EXPORT_SYMBOL(skb_gro_reset_offset);
3471
3472 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3473 {
3474         skb_gro_reset_offset(skb);
3475
3476         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3477 }
3478 EXPORT_SYMBOL(napi_gro_receive);
3479
3480 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3481 {
3482         __skb_pull(skb, skb_headlen(skb));
3483         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3484         skb->vlan_tci = 0;
3485         skb->dev = napi->dev;
3486         skb->skb_iif = 0;
3487
3488         napi->skb = skb;
3489 }
3490
3491 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3492 {
3493         struct sk_buff *skb = napi->skb;
3494
3495         if (!skb) {
3496                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3497                 if (skb)
3498                         napi->skb = skb;
3499         }
3500         return skb;
3501 }
3502 EXPORT_SYMBOL(napi_get_frags);
3503
3504 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3505                                gro_result_t ret)
3506 {
3507         switch (ret) {
3508         case GRO_NORMAL:
3509         case GRO_HELD:
3510                 skb->protocol = eth_type_trans(skb, skb->dev);
3511
3512                 if (ret == GRO_HELD)
3513                         skb_gro_pull(skb, -ETH_HLEN);
3514                 else if (netif_receive_skb(skb))
3515                         ret = GRO_DROP;
3516                 break;
3517
3518         case GRO_DROP:
3519         case GRO_MERGED_FREE:
3520                 napi_reuse_skb(napi, skb);
3521                 break;
3522
3523         case GRO_MERGED:
3524                 break;
3525         }
3526
3527         return ret;
3528 }
3529 EXPORT_SYMBOL(napi_frags_finish);
3530
3531 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3532 {
3533         struct sk_buff *skb = napi->skb;
3534         struct ethhdr *eth;
3535         unsigned int hlen;
3536         unsigned int off;
3537
3538         napi->skb = NULL;
3539
3540         skb_reset_mac_header(skb);
3541         skb_gro_reset_offset(skb);
3542
3543         off = skb_gro_offset(skb);
3544         hlen = off + sizeof(*eth);
3545         eth = skb_gro_header_fast(skb, off);
3546         if (skb_gro_header_hard(skb, hlen)) {
3547                 eth = skb_gro_header_slow(skb, hlen, off);
3548                 if (unlikely(!eth)) {
3549                         napi_reuse_skb(napi, skb);
3550                         skb = NULL;
3551                         goto out;
3552                 }
3553         }
3554
3555         skb_gro_pull(skb, sizeof(*eth));
3556
3557         /*
3558          * This works because the only protocols we care about don't require
3559          * special handling.  We'll fix it up properly at the end.
3560          */
3561         skb->protocol = eth->h_proto;
3562
3563 out:
3564         return skb;
3565 }
3566 EXPORT_SYMBOL(napi_frags_skb);
3567
3568 gro_result_t napi_gro_frags(struct napi_struct *napi)
3569 {
3570         struct sk_buff *skb = napi_frags_skb(napi);
3571
3572         if (!skb)
3573                 return GRO_DROP;
3574
3575         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3576 }
3577 EXPORT_SYMBOL(napi_gro_frags);
3578
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but exits with local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *pending = sd->rps_ipi_list;

        if (!pending) {
                local_irq_enable();
                return;
        }

        /* Detach the whole list while irqs are still off. */
        sd->rps_ipi_list = NULL;

        local_irq_enable();

        /* Send pending IPIs to kick RPS processing on remote cpus. */
        do {
                struct softnet_data *next = pending->rps_ipi_next;

                if (cpu_online(pending->cpu))
                        __smp_call_function_single(pending->cpu,
                                                   &pending->csd, 0);
                pending = next;
        } while (pending);
#else
        local_irq_enable();
#endif
}
3606
3607 static int process_backlog(struct napi_struct *napi, int quota)
3608 {
3609         int work = 0;
3610         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3611
3612 #ifdef CONFIG_RPS
3613         /* Check if we have pending ipi, its better to send them now,
3614          * not waiting net_rx_action() end.
3615          */
3616         if (sd->rps_ipi_list) {
3617                 local_irq_disable();
3618                 net_rps_action_and_irq_enable(sd);
3619         }
3620 #endif
3621         napi->weight = weight_p;
3622         local_irq_disable();
3623         while (work < quota) {
3624                 struct sk_buff *skb;
3625                 unsigned int qlen;
3626
3627                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3628                         local_irq_enable();
3629                         __netif_receive_skb(skb);
3630                         local_irq_disable();
3631                         input_queue_head_incr(sd);
3632                         if (++work >= quota) {
3633                                 local_irq_enable();
3634                                 return work;
3635                         }
3636                 }
3637
3638                 rps_lock(sd);
3639                 qlen = skb_queue_len(&sd->input_pkt_queue);
3640                 if (qlen)
3641                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3642                                                    &sd->process_queue);
3643
3644                 if (qlen < quota - work) {
3645                         /*
3646                          * Inline a custom version of __napi_complete().
3647                          * only current cpu owns and manipulates this napi,
3648                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3649                          * we can use a plain write instead of clear_bit(),
3650                          * and we dont need an smp_mb() memory barrier.
3651                          */
3652                         list_del(&napi->poll_list);
3653                         napi->state = 0;
3654
3655                         quota = work + qlen;
3656                 }
3657                 rps_unlock(sd);
3658         }
3659         local_irq_enable();
3660
3661         return work;
3662 }
3663
3664 /**
3665  * __napi_schedule - schedule for receive
3666  * @n: entry to schedule
3667  *
3668  * The entry's receive function will be scheduled to run
3669  */
3670 void __napi_schedule(struct napi_struct *n)
3671 {
3672         unsigned long flags;
3673
3674         local_irq_save(flags);
3675         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3676         local_irq_restore(flags);
3677 }
3678 EXPORT_SYMBOL(__napi_schedule);
3679
3680 void __napi_complete(struct napi_struct *n)
3681 {
3682         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3683         BUG_ON(n->gro_list);
3684
3685         list_del(&n->poll_list);
3686         smp_mb__before_clear_bit();
3687         clear_bit(NAPI_STATE_SCHED, &n->state);
3688 }
3689 EXPORT_SYMBOL(__napi_complete);
3690
3691 void napi_complete(struct napi_struct *n)
3692 {
3693         unsigned long flags;
3694
3695         /*
3696          * don't let napi dequeue from the cpu poll list
3697          * just in case its running on a different cpu
3698          */
3699         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3700                 return;
3701
3702         napi_gro_flush(n);
3703         local_irq_save(flags);
3704         __napi_complete(n);
3705         local_irq_restore(flags);
3706 }
3707 EXPORT_SYMBOL(napi_complete);
3708
3709 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3710                     int (*poll)(struct napi_struct *, int), int weight)
3711 {
3712         INIT_LIST_HEAD(&napi->poll_list);
3713         napi->gro_count = 0;
3714         napi->gro_list = NULL;
3715         napi->skb = NULL;
3716         napi->poll = poll;
3717         napi->weight = weight;
3718         list_add(&napi->dev_list, &dev->napi_list);
3719         napi->dev = dev;
3720 #ifdef CONFIG_NETPOLL
3721         spin_lock_init(&napi->poll_lock);
3722         napi->poll_owner = -1;
3723 #endif
3724         set_bit(NAPI_STATE_SCHED, &napi->state);
3725 }
3726 EXPORT_SYMBOL(netif_napi_add);
3727
3728 void netif_napi_del(struct napi_struct *napi)
3729 {
3730         struct sk_buff *skb, *next;
3731
3732         list_del_init(&napi->dev_list);
3733         napi_free_frags(napi);
3734
3735         for (skb = napi->gro_list; skb; skb = next) {
3736                 next = skb->next;
3737                 skb->next = NULL;
3738                 kfree_skb(skb);
3739         }
3740
3741         napi->gro_list = NULL;
3742         napi->gro_count = 0;
3743 }
3744 EXPORT_SYMBOL(netif_napi_del);
3745
3746 static void net_rx_action(struct softirq_action *h)
3747 {
3748         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3749         unsigned long time_limit = jiffies + 2;
3750         int budget = netdev_budget;
3751         void *have;
3752
3753         local_irq_disable();
3754
3755         while (!list_empty(&sd->poll_list)) {
3756                 struct napi_struct *n;
3757                 int work, weight;
3758
3759                 /* If softirq window is exhuasted then punt.
3760                  * Allow this to run for 2 jiffies since which will allow
3761                  * an average latency of 1.5/HZ.
3762                  */
3763                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3764                         goto softnet_break;
3765
3766                 local_irq_enable();
3767
3768                 /* Even though interrupts have been re-enabled, this
3769                  * access is safe because interrupts can only add new
3770                  * entries to the tail of this list, and only ->poll()
3771                  * calls can remove this head entry from the list.
3772                  */
3773                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3774
3775                 have = netpoll_poll_lock(n);
3776
3777                 weight = n->weight;
3778
3779                 /* This NAPI_STATE_SCHED test is for avoiding a race
3780                  * with netpoll's poll_napi().  Only the entity which
3781                  * obtains the lock and sees NAPI_STATE_SCHED set will
3782                  * actually make the ->poll() call.  Therefore we avoid
3783                  * accidently calling ->poll() when NAPI is not scheduled.
3784                  */
3785                 work = 0;
3786                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3787                         work = n->poll(n, weight);
3788                         trace_napi_poll(n);
3789                 }
3790
3791                 WARN_ON_ONCE(work > weight);
3792
3793                 budget -= work;
3794
3795                 local_irq_disable();
3796
3797                 /* Drivers must not modify the NAPI state if they
3798                  * consume the entire weight.  In such cases this code
3799                  * still "owns" the NAPI instance and therefore can
3800                  * move the instance around on the list at-will.
3801                  */
3802                 if (unlikely(work == weight)) {
3803                         if (unlikely(napi_disable_pending(n))) {
3804                                 local_irq_enable();
3805                                 napi_complete(n);
3806                                 local_irq_disable();
3807                         } else
3808                                 list_move_tail(&n->poll_list, &sd->poll_list);
3809                 }
3810
3811                 netpoll_poll_unlock(have);
3812         }
3813 out:
3814         net_rps_action_and_irq_enable(sd);
3815
3816 #ifdef CONFIG_NET_DMA
3817         /*
3818          * There may not be any more sk_buffs coming right now, so push
3819          * any pending DMA copies to hardware
3820          */
3821         dma_issue_pending_all();
3822 #endif
3823
3824         return;
3825
3826 softnet_break:
3827         sd->time_squeeze++;
3828         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3829         goto out;
3830 }
3831
3832 static gifconf_func_t *gifconf_list[NPROTO];
3833
3834 /**
3835  *      register_gifconf        -       register a SIOCGIF handler
3836  *      @family: Address family
3837  *      @gifconf: Function handler
3838  *
3839  *      Register protocol dependent address dumping routines. The handler
3840  *      that is passed must not be freed or reused until it has been replaced
3841  *      by another handler.
3842  */
3843 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3844 {
3845         if (family >= NPROTO)
3846                 return -EINVAL;
3847         gifconf_list[family] = gifconf;
3848         return 0;
3849 }
3850 EXPORT_SYMBOL(register_gifconf);
3851
3852
3853 /*
3854  *      Map an interface index to its name (SIOCGIFNAME)
3855  */
3856
3857 /*
3858  *      We need this ioctl for efficient implementation of the
3859  *      if_indextoname() function required by the IPv6 API.  Without
3860  *      it, we would have to search all the interfaces to find a
3861  *      match.  --pb
3862  */
3863
3864 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3865 {
3866         struct net_device *dev;
3867         struct ifreq ifr;
3868
3869         /*
3870          *      Fetch the caller's info block.
3871          */
3872
3873         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3874                 return -EFAULT;
3875
3876         rcu_read_lock();
3877         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3878         if (!dev) {
3879                 rcu_read_unlock();
3880                 return -ENODEV;
3881         }
3882
3883         strcpy(ifr.ifr_name, dev->name);
3884         rcu_read_unlock();
3885
3886         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3887                 return -EFAULT;
3888         return 0;
3889 }
3890
3891 /*
3892  *      Perform a SIOCGIFCONF call. This structure will change
3893  *      size eventually, and there is nothing I can do about it.
3894  *      Thus we will need a 'compatibility mode'.
3895  */
3896
3897 static int dev_ifconf(struct net *net, char __user *arg)
3898 {
3899         struct ifconf ifc;
3900         struct net_device *dev;
3901         char __user *pos;
3902         int len;
3903         int total;
3904         int i;
3905
3906         /*
3907          *      Fetch the caller's info block.
3908          */
3909
3910         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3911                 return -EFAULT;
3912
3913         pos = ifc.ifc_buf;
3914         len = ifc.ifc_len;
3915
3916         /*
3917          *      Loop over the interfaces, and write an info block for each.
3918          */
3919
3920         total = 0;
3921         for_each_netdev(net, dev) {
3922                 for (i = 0; i < NPROTO; i++) {
3923                         if (gifconf_list[i]) {
3924                                 int done;
3925                                 if (!pos)
3926                                         done = gifconf_list[i](dev, NULL, 0);
3927                                 else
3928                                         done = gifconf_list[i](dev, pos + total,
3929                                                                len - total);
3930                                 if (done < 0)
3931                                         return -EFAULT;
3932                                 total += done;
3933                         }
3934                 }
3935         }
3936
3937         /*
3938          *      All done.  Write the updated control block back to the caller.
3939          */
3940         ifc.ifc_len = total;
3941
3942         /*
3943          *      Both BSD and Solaris return 0 here, so we do too.
3944          */
3945         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3946 }
3947
3948 #ifdef CONFIG_PROC_FS
3949 /*
3950  *      This is invoked by the /proc filesystem handler to display a device
3951  *      in detail.
3952  */
3953 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3954         __acquires(RCU)
3955 {
3956         struct net *net = seq_file_net(seq);
3957         loff_t off;
3958         struct net_device *dev;
3959
3960         rcu_read_lock();
3961         if (!*pos)
3962                 return SEQ_START_TOKEN;
3963
3964         off = 1;
3965         for_each_netdev_rcu(net, dev)
3966                 if (off++ == *pos)
3967                         return dev;
3968
3969         return NULL;
3970 }
3971
3972 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3973 {
3974         struct net_device *dev = v;
3975
3976         if (v == SEQ_START_TOKEN)
3977                 dev = first_net_device_rcu(seq_file_net(seq));
3978         else
3979                 dev = next_net_device_rcu(dev);
3980
3981         ++*pos;
3982         return dev;
3983 }
3984
3985 void dev_seq_stop(struct seq_file *seq, void *v)
3986         __releases(RCU)
3987 {
3988         rcu_read_unlock();
3989 }
3990
3991 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3992 {
3993         struct rtnl_link_stats64 temp;
3994         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3995
3996         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3997                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3998                    dev->name, stats->rx_bytes, stats->rx_packets,
3999                    stats->rx_errors,
4000                    stats->rx_dropped + stats->rx_missed_errors,
4001                    stats->rx_fifo_errors,
4002                    stats->rx_length_errors + stats->rx_over_errors +
4003                     stats->rx_crc_errors + stats->rx_frame_errors,
4004                    stats->rx_compressed, stats->multicast,
4005                    stats->tx_bytes, stats->tx_packets,
4006                    stats->tx_errors, stats->tx_dropped,
4007                    stats->tx_fifo_errors, stats->collisions,
4008                    stats->tx_carrier_errors +
4009                     stats->tx_aborted_errors +
4010                     stats->tx_window_errors +
4011                     stats->tx_heartbeat_errors,
4012                    stats->tx_compressed);
4013 }
4014
4015 /*
4016  *      Called from the PROCfs module. This now uses the new arbitrary sized
4017  *      /proc/net interface to create /proc/net/dev
4018  */
4019 static int dev_seq_show(struct seq_file *seq, void *v)
4020 {
4021         if (v == SEQ_START_TOKEN)
4022                 seq_puts(seq, "Inter-|   Receive                            "
4023                               "                    |  Transmit\n"
4024                               " face |bytes    packets errs drop fifo frame "
4025                               "compressed multicast|bytes    packets errs "
4026                               "drop fifo colls carrier compressed\n");
4027         else
4028                 dev_seq_printf_stats(seq, v);
4029         return 0;
4030 }
4031
4032 static struct softnet_data *softnet_get_online(loff_t *pos)
4033 {
4034         struct softnet_data *sd = NULL;
4035
4036         while (*pos < nr_cpu_ids)
4037                 if (cpu_online(*pos)) {
4038                         sd = &per_cpu(softnet_data, *pos);
4039                         break;
4040                 } else
4041                         ++*pos;
4042         return sd;
4043 }
4044
4045 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4046 {
4047         return softnet_get_online(pos);
4048 }
4049
4050 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4051 {
4052         ++*pos;
4053         return softnet_get_online(pos);
4054 }
4055
4056 static void softnet_seq_stop(struct seq_file *seq, void *v)
4057 {
4058 }
4059
4060 static int softnet_seq_show(struct seq_file *seq, void *v)
4061 {
4062         struct softnet_data *sd = v;
4063
4064         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4065                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4066                    0, 0, 0, 0, /* was fastroute */
4067                    sd->cpu_collision, sd->received_rps);
4068         return 0;
4069 }
4070
4071 static const struct seq_operations dev_seq_ops = {
4072         .start = dev_seq_start,
4073         .next  = dev_seq_next,
4074         .stop  = dev_seq_stop,
4075         .show  = dev_seq_show,
4076 };
4077
4078 static int dev_seq_open(struct inode *inode, struct file *file)
4079 {
4080         return seq_open_net(inode, file, &dev_seq_ops,
4081                             sizeof(struct seq_net_private));
4082 }
4083
4084 static const struct file_operations dev_seq_fops = {
4085         .owner   = THIS_MODULE,
4086         .open    = dev_seq_open,
4087         .read    = seq_read,
4088         .llseek  = seq_lseek,
4089         .release = seq_release_net,
4090 };
4091
4092 static const struct seq_operations softnet_seq_ops = {
4093         .start = softnet_seq_start,
4094         .next  = softnet_seq_next,
4095         .stop  = softnet_seq_stop,
4096         .show  = softnet_seq_show,
4097 };
4098
4099 static int softnet_seq_open(struct inode *inode, struct file *file)
4100 {
4101         return seq_open(file, &softnet_seq_ops);
4102 }
4103
4104 static const struct file_operations softnet_seq_fops = {
4105         .owner   = THIS_MODULE,
4106         .open    = softnet_seq_open,
4107         .read    = seq_read,
4108         .llseek  = seq_lseek,
4109         .release = seq_release,
4110 };
4111
4112 static void *ptype_get_idx(loff_t pos)
4113 {
4114         struct packet_type *pt = NULL;
4115         loff_t i = 0;
4116         int t;
4117
4118         list_for_each_entry_rcu(pt, &ptype_all, list) {
4119                 if (i == pos)
4120                         return pt;
4121                 ++i;
4122         }
4123
4124         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4125                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4126                         if (i == pos)
4127                                 return pt;
4128                         ++i;
4129                 }
4130         }
4131         return NULL;
4132 }
4133
4134 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4135         __acquires(RCU)
4136 {
4137         rcu_read_lock();
4138         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4139 }
4140
4141 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4142 {
4143         struct packet_type *pt;
4144         struct list_head *nxt;
4145         int hash;
4146
4147         ++*pos;
4148         if (v == SEQ_START_TOKEN)
4149                 return ptype_get_idx(0);
4150
4151         pt = v;
4152         nxt = pt->list.next;
4153         if (pt->type == htons(ETH_P_ALL)) {
4154                 if (nxt != &ptype_all)
4155                         goto found;
4156                 hash = 0;
4157                 nxt = ptype_base[0].next;
4158         } else
4159                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4160
4161         while (nxt == &ptype_base[hash]) {
4162                 if (++hash >= PTYPE_HASH_SIZE)
4163                         return NULL;
4164                 nxt = ptype_base[hash].next;
4165         }
4166 found:
4167         return list_entry(nxt, struct packet_type, list);
4168 }
4169
4170 static void ptype_seq_stop(struct seq_file *seq, void *v)
4171         __releases(RCU)
4172 {
4173         rcu_read_unlock();
4174 }
4175
4176 static int ptype_seq_show(struct seq_file *seq, void *v)
4177 {
4178         struct packet_type *pt = v;
4179
4180         if (v == SEQ_START_TOKEN)
4181                 seq_puts(seq, "Type Device      Function\n");
4182         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4183                 if (pt->type == htons(ETH_P_ALL))
4184                         seq_puts(seq, "ALL ");
4185                 else
4186                         seq_printf(seq, "%04x", ntohs(pt->type));
4187
4188                 seq_printf(seq, " %-8s %pF\n",
4189                            pt->dev ? pt->dev->name : "", pt->func);
4190         }
4191
4192         return 0;
4193 }
4194
4195 static const struct seq_operations ptype_seq_ops = {
4196         .start = ptype_seq_start,
4197         .next  = ptype_seq_next,
4198         .stop  = ptype_seq_stop,
4199         .show  = ptype_seq_show,
4200 };
4201
4202 static int ptype_seq_open(struct inode *inode, struct file *file)
4203 {
4204         return seq_open_net(inode, file, &ptype_seq_ops,
4205                         sizeof(struct seq_net_private));
4206 }
4207
4208 static const struct file_operations ptype_seq_fops = {
4209         .owner   = THIS_MODULE,
4210         .open    = ptype_seq_open,
4211         .read    = seq_read,
4212         .llseek  = seq_lseek,
4213         .release = seq_release_net,
4214 };
4215
4216
4217 static int __net_init dev_proc_net_init(struct net *net)
4218 {
4219         int rc = -ENOMEM;
4220
4221         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4222                 goto out;
4223         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4224                 goto out_dev;
4225         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4226                 goto out_softnet;
4227
4228         if (wext_proc_init(net))
4229                 goto out_ptype;
4230         rc = 0;
4231 out:
4232         return rc;
4233 out_ptype:
4234         proc_net_remove(net, "ptype");
4235 out_softnet:
4236         proc_net_remove(net, "softnet_stat");
4237 out_dev:
4238         proc_net_remove(net, "dev");
4239         goto out;
4240 }
4241
4242 static void __net_exit dev_proc_net_exit(struct net *net)
4243 {
4244         wext_proc_exit(net);
4245
4246         proc_net_remove(net, "ptype");
4247         proc_net_remove(net, "softnet_stat");
4248         proc_net_remove(net, "dev");
4249 }
4250
4251 static struct pernet_operations __net_initdata dev_proc_ops = {
4252         .init = dev_proc_net_init,
4253         .exit = dev_proc_net_exit,
4254 };
4255
4256 static int __init dev_proc_init(void)
4257 {
4258         return register_pernet_subsys(&dev_proc_ops);
4259 }
4260 #else
4261 #define dev_proc_init() 0
4262 #endif  /* CONFIG_PROC_FS */
4263
4264
4265 /**
4266  *      netdev_set_master       -       set up master pointer
4267  *      @slave: slave device
4268  *      @master: new master device
4269  *
4270  *      Changes the master device of the slave. Pass %NULL to break the
4271  *      bonding. The caller must hold the RTNL semaphore. On a failure
4272  *      a negative errno code is returned. On success the reference counts
4273  *      are adjusted and the function returns zero.
4274  */
4275 int netdev_set_master(struct net_device *slave, struct net_device *master)
4276 {
4277         struct net_device *old = slave->master;
4278
4279         ASSERT_RTNL();
4280
4281         if (master) {
4282                 if (old)
4283                         return -EBUSY;
4284                 dev_hold(master);
4285         }
4286
4287         slave->master = master;
4288
4289         if (old) {
4290                 synchronize_net();
4291                 dev_put(old);
4292         }
4293         return 0;
4294 }
4295 EXPORT_SYMBOL(netdev_set_master);
4296
4297 /**
4298  *      netdev_set_bond_master  -       set up bonding master/slave pair
4299  *      @slave: slave device
4300  *      @master: new master device
4301  *
4302  *      Changes the master device of the slave. Pass %NULL to break the
4303  *      bonding. The caller must hold the RTNL semaphore. On a failure
4304  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4305  *      to the routing socket and the function returns zero.
4306  */
4307 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4308 {
4309         int err;
4310
4311         ASSERT_RTNL();
4312
4313         err = netdev_set_master(slave, master);
4314         if (err)
4315                 return err;
4316         if (master)
4317                 slave->flags |= IFF_SLAVE;
4318         else
4319                 slave->flags &= ~IFF_SLAVE;
4320
4321         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4322         return 0;
4323 }
4324 EXPORT_SYMBOL(netdev_set_bond_master);
4325
4326 static void dev_change_rx_flags(struct net_device *dev, int flags)
4327 {
4328         const struct net_device_ops *ops = dev->netdev_ops;
4329
4330         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4331                 ops->ndo_change_rx_flags(dev, flags);
4332 }
4333
4334 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4335 {
4336         unsigned short old_flags = dev->flags;
4337         uid_t uid;
4338         gid_t gid;
4339
4340         ASSERT_RTNL();
4341
4342         dev->flags |= IFF_PROMISC;
4343         dev->promiscuity += inc;
4344         if (dev->promiscuity == 0) {
4345                 /*
4346                  * Avoid overflow.
4347                  * If inc causes overflow, untouch promisc and return error.
4348                  */
4349                 if (inc < 0)
4350                         dev->flags &= ~IFF_PROMISC;
4351                 else {
4352                         dev->promiscuity -= inc;
4353                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4354                                 "set promiscuity failed, promiscuity feature "
4355                                 "of device might be broken.\n", dev->name);
4356                         return -EOVERFLOW;
4357                 }
4358         }
4359         if (dev->flags != old_flags) {
4360                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4361                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4362                                                                "left");
4363                 if (audit_enabled) {
4364                         current_uid_gid(&uid, &gid);
4365                         audit_log(current->audit_context, GFP_ATOMIC,
4366                                 AUDIT_ANOM_PROMISCUOUS,
4367                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4368                                 dev->name, (dev->flags & IFF_PROMISC),
4369                                 (old_flags & IFF_PROMISC),
4370                                 audit_get_loginuid(current),
4371                                 uid, gid,
4372                                 audit_get_sessionid(current));
4373                 }
4374
4375                 dev_change_rx_flags(dev, IFF_PROMISC);
4376         }
4377         return 0;
4378 }
4379
4380 /**
4381  *      dev_set_promiscuity     - update promiscuity count on a device
4382  *      @dev: device
4383  *      @inc: modifier
4384  *
4385  *      Add or remove promiscuity from a device. While the count in the device
4386  *      remains above zero the interface remains promiscuous. Once it hits zero
4387  *      the device reverts back to normal filtering operation. A negative inc
4388  *      value is used to drop promiscuity on the device.
4389  *      Return 0 if successful or a negative errno code on error.
4390  */
4391 int dev_set_promiscuity(struct net_device *dev, int inc)
4392 {
4393         unsigned short old_flags = dev->flags;
4394         int err;
4395
4396         err = __dev_set_promiscuity(dev, inc);
4397         if (err < 0)
4398                 return err;
4399         if (dev->flags != old_flags)
4400                 dev_set_rx_mode(dev);
4401         return err;
4402 }
4403 EXPORT_SYMBOL(dev_set_promiscuity);
4404
4405 /**
4406  *      dev_set_allmulti        - update allmulti count on a device
4407  *      @dev: device
4408  *      @inc: modifier
4409  *
4410  *      Add or remove reception of all multicast frames to a device. While the
4411  *      count in the device remains above zero the interface remains listening
4412  *      to all interfaces. Once it hits zero the device reverts back to normal
4413  *      filtering operation. A negative @inc value is used to drop the counter
4414  *      when releasing a resource needing all multicasts.
4415  *      Return 0 if successful or a negative errno code on error.
4416  */
4417
4418 int dev_set_allmulti(struct net_device *dev, int inc)
4419 {
4420         unsigned short old_flags = dev->flags;
4421
4422         ASSERT_RTNL();
4423
4424         dev->flags |= IFF_ALLMULTI;
4425         dev->allmulti += inc;
4426         if (dev->allmulti == 0) {
4427                 /*
4428                  * Avoid overflow.
4429                  * If inc causes overflow, untouch allmulti and return error.
4430                  */
4431                 if (inc < 0)
4432                         dev->flags &= ~IFF_ALLMULTI;
4433                 else {
4434                         dev->allmulti -= inc;
4435                         printk(KERN_WARNING "%s: allmulti touches roof, "
4436                                 "set allmulti failed, allmulti feature of "
4437                                 "device might be broken.\n", dev->name);
4438                         return -EOVERFLOW;
4439                 }
4440         }
4441         if (dev->flags ^ old_flags) {
4442                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4443                 dev_set_rx_mode(dev);
4444         }
4445         return 0;
4446 }
4447 EXPORT_SYMBOL(dev_set_allmulti);
4448
4449 /*
4450  *      Upload unicast and multicast address lists to device and
4451  *      configure RX filtering. When the device doesn't support unicast
4452  *      filtering it is put in promiscuous mode while unicast addresses
4453  *      are present.
4454  */
4455 void __dev_set_rx_mode(struct net_device *dev)
4456 {
4457         const struct net_device_ops *ops = dev->netdev_ops;
4458
4459         /* dev_open will call this function so the list will stay sane. */
4460         if (!(dev->flags&IFF_UP))
4461                 return;
4462
4463         if (!netif_device_present(dev))
4464                 return;
4465
4466         if (ops->ndo_set_rx_mode)
4467                 ops->ndo_set_rx_mode(dev);
4468         else {
4469                 /* Unicast addresses changes may only happen under the rtnl,
4470                  * therefore calling __dev_set_promiscuity here is safe.
4471                  */
4472                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4473                         __dev_set_promiscuity(dev, 1);
4474                         dev->uc_promisc = 1;
4475                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4476                         __dev_set_promiscuity(dev, -1);
4477                         dev->uc_promisc = 0;
4478                 }
4479
4480                 if (ops->ndo_set_multicast_list)
4481                         ops->ndo_set_multicast_list(dev);
4482         }
4483 }
4484
/*
 *	Locked front end for __dev_set_rx_mode(): the update of @dev runs
 *	between netif_addr_lock_bh() and netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4491
4492 /**
4493  *      dev_get_flags - get flags reported to userspace
4494  *      @dev: device
4495  *
4496  *      Get the combination of flag bits exported through APIs to userspace.
4497  */
4498 unsigned dev_get_flags(const struct net_device *dev)
4499 {
4500         unsigned flags;
4501
4502         flags = (dev->flags & ~(IFF_PROMISC |
4503                                 IFF_ALLMULTI |
4504                                 IFF_RUNNING |
4505                                 IFF_LOWER_UP |
4506                                 IFF_DORMANT)) |
4507                 (dev->gflags & (IFF_PROMISC |
4508                                 IFF_ALLMULTI));
4509
4510         if (netif_running(dev)) {
4511                 if (netif_oper_up(dev))
4512                         flags |= IFF_RUNNING;
4513                 if (netif_carrier_ok(dev))
4514                         flags |= IFF_LOWER_UP;
4515                 if (netif_dormant(dev))
4516                         flags |= IFF_DORMANT;
4517         }
4518
4519         return flags;
4520 }
4521 EXPORT_SYMBOL(dev_get_flags);
4522
4523 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4524 {
4525         int old_flags = dev->flags;
4526         int ret;
4527
4528         ASSERT_RTNL();
4529
4530         /*
4531          *      Set the flags on our device.
4532          */
4533
4534         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4535                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4536                                IFF_AUTOMEDIA)) |
4537                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4538                                     IFF_ALLMULTI));
4539
4540         /*
4541          *      Load in the correct multicast list now the flags have changed.
4542          */
4543
4544         if ((old_flags ^ flags) & IFF_MULTICAST)
4545                 dev_change_rx_flags(dev, IFF_MULTICAST);
4546
4547         dev_set_rx_mode(dev);
4548
4549         /*
4550          *      Have we downed the interface. We handle IFF_UP ourselves
4551          *      according to user attempts to set it, rather than blindly
4552          *      setting it.
4553          */
4554
4555         ret = 0;
4556         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4557                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4558
4559                 if (!ret)
4560                         dev_set_rx_mode(dev);
4561         }
4562
4563         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4564                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4565
4566                 dev->gflags ^= IFF_PROMISC;
4567                 dev_set_promiscuity(dev, inc);
4568         }
4569
4570         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4571            is important. Some (broken) drivers set IFF_PROMISC, when
4572            IFF_ALLMULTI is requested not asking us and not reporting.
4573          */
4574         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4575                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4576
4577                 dev->gflags ^= IFF_ALLMULTI;
4578                 dev_set_allmulti(dev, inc);
4579         }
4580
4581         return ret;
4582 }
4583
4584 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4585 {
4586         unsigned int changes = dev->flags ^ old_flags;
4587
4588         if (changes & IFF_UP) {
4589                 if (dev->flags & IFF_UP)
4590                         call_netdevice_notifiers(NETDEV_UP, dev);
4591                 else
4592                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4593         }
4594
4595         if (dev->flags & IFF_UP &&
4596             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4597                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4598 }
4599
4600 /**
4601  *      dev_change_flags - change device settings
4602  *      @dev: device
4603  *      @flags: device state flags
4604  *
4605  *      Change settings on device based state flags. The flags are
4606  *      in the userspace exported format.
4607  */
4608 int dev_change_flags(struct net_device *dev, unsigned flags)
4609 {
4610         int ret, changes;
4611         int old_flags = dev->flags;
4612
4613         ret = __dev_change_flags(dev, flags);
4614         if (ret < 0)
4615                 return ret;
4616
4617         changes = old_flags ^ dev->flags;
4618         if (changes)
4619                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4620
4621         __dev_notify_flags(dev, old_flags);
4622         return ret;
4623 }
4624 EXPORT_SYMBOL(dev_change_flags);
4625
4626 /**
4627  *      dev_set_mtu - Change maximum transfer unit
4628  *      @dev: device
4629  *      @new_mtu: new transfer unit
4630  *
4631  *      Change the maximum transfer size of the network device.
4632  */
4633 int dev_set_mtu(struct net_device *dev, int new_mtu)
4634 {
4635         const struct net_device_ops *ops = dev->netdev_ops;
4636         int err;
4637
4638         if (new_mtu == dev->mtu)
4639                 return 0;
4640
4641         /*      MTU must be positive.    */
4642         if (new_mtu < 0)
4643                 return -EINVAL;
4644
4645         if (!netif_device_present(dev))
4646                 return -ENODEV;
4647
4648         err = 0;
4649         if (ops->ndo_change_mtu)
4650                 err = ops->ndo_change_mtu(dev, new_mtu);
4651         else
4652                 dev->mtu = new_mtu;
4653
4654         if (!err && dev->flags & IFF_UP)
4655                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4656         return err;
4657 }
4658 EXPORT_SYMBOL(dev_set_mtu);
4659
4660 /**
4661  *      dev_set_group - Change group this device belongs to
4662  *      @dev: device
4663  *      @new_group: group this device should belong to
4664  */
4665 void dev_set_group(struct net_device *dev, int new_group)
4666 {
4667         dev->group = new_group;
4668 }
4669 EXPORT_SYMBOL(dev_set_group);
4670
4671 /**
4672  *      dev_set_mac_address - Change Media Access Control Address
4673  *      @dev: device
4674  *      @sa: new address
4675  *
4676  *      Change the hardware (MAC) address of the device
4677  */
4678 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4679 {
4680         const struct net_device_ops *ops = dev->netdev_ops;
4681         int err;
4682
4683         if (!ops->ndo_set_mac_address)
4684                 return -EOPNOTSUPP;
4685         if (sa->sa_family != dev->type)
4686                 return -EINVAL;
4687         if (!netif_device_present(dev))
4688                 return -ENODEV;
4689         err = ops->ndo_set_mac_address(dev, sa);
4690         if (!err)
4691                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4692         return err;
4693 }
4694 EXPORT_SYMBOL(dev_set_mac_address);
4695
4696 /*
4697  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4698  */
4699 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4700 {
4701         int err;
4702         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4703
4704         if (!dev)
4705                 return -ENODEV;
4706
4707         switch (cmd) {
4708         case SIOCGIFFLAGS:      /* Get interface flags */
4709                 ifr->ifr_flags = (short) dev_get_flags(dev);
4710                 return 0;
4711
4712         case SIOCGIFMETRIC:     /* Get the metric on the interface
4713                                    (currently unused) */
4714                 ifr->ifr_metric = 0;
4715                 return 0;
4716
4717         case SIOCGIFMTU:        /* Get the MTU of a device */
4718                 ifr->ifr_mtu = dev->mtu;
4719                 return 0;
4720
4721         case SIOCGIFHWADDR:
4722                 if (!dev->addr_len)
4723                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4724                 else
4725                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4726                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4727                 ifr->ifr_hwaddr.sa_family = dev->type;
4728                 return 0;
4729
4730         case SIOCGIFSLAVE:
4731                 err = -EINVAL;
4732                 break;
4733
4734         case SIOCGIFMAP:
4735                 ifr->ifr_map.mem_start = dev->mem_start;
4736                 ifr->ifr_map.mem_end   = dev->mem_end;
4737                 ifr->ifr_map.base_addr = dev->base_addr;
4738                 ifr->ifr_map.irq       = dev->irq;
4739                 ifr->ifr_map.dma       = dev->dma;
4740                 ifr->ifr_map.port      = dev->if_port;
4741                 return 0;
4742
4743         case SIOCGIFINDEX:
4744                 ifr->ifr_ifindex = dev->ifindex;
4745                 return 0;
4746
4747         case SIOCGIFTXQLEN:
4748                 ifr->ifr_qlen = dev->tx_queue_len;
4749                 return 0;
4750
4751         default:
4752                 /* dev_ioctl() should ensure this case
4753                  * is never reached
4754                  */
4755                 WARN_ON(1);
4756                 err = -EINVAL;
4757                 break;
4758
4759         }
4760         return err;
4761 }
4762
4763 /*
4764  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4765  */
4766 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4767 {
4768         int err;
4769         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4770         const struct net_device_ops *ops;
4771
4772         if (!dev)
4773                 return -ENODEV;
4774
4775         ops = dev->netdev_ops;
4776
4777         switch (cmd) {
4778         case SIOCSIFFLAGS:      /* Set interface flags */
4779                 return dev_change_flags(dev, ifr->ifr_flags);
4780
4781         case SIOCSIFMETRIC:     /* Set the metric on the interface
4782                                    (currently unused) */
4783                 return -EOPNOTSUPP;
4784
4785         case SIOCSIFMTU:        /* Set the MTU of a device */
4786                 return dev_set_mtu(dev, ifr->ifr_mtu);
4787
4788         case SIOCSIFHWADDR:
4789                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4790
4791         case SIOCSIFHWBROADCAST:
4792                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4793                         return -EINVAL;
4794                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4795                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4796                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4797                 return 0;
4798
4799         case SIOCSIFMAP:
4800                 if (ops->ndo_set_config) {
4801                         if (!netif_device_present(dev))
4802                                 return -ENODEV;
4803                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4804                 }
4805                 return -EOPNOTSUPP;
4806
4807         case SIOCADDMULTI:
4808                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4809                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4810                         return -EINVAL;
4811                 if (!netif_device_present(dev))
4812                         return -ENODEV;
4813                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4814
4815         case SIOCDELMULTI:
4816                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4817                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4818                         return -EINVAL;
4819                 if (!netif_device_present(dev))
4820                         return -ENODEV;
4821                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4822
4823         case SIOCSIFTXQLEN:
4824                 if (ifr->ifr_qlen < 0)
4825                         return -EINVAL;
4826                 dev->tx_queue_len = ifr->ifr_qlen;
4827                 return 0;
4828
4829         case SIOCSIFNAME:
4830                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4831                 return dev_change_name(dev, ifr->ifr_newname);
4832
4833         /*
4834          *      Unknown or private ioctl
4835          */
4836         default:
4837                 if ((cmd >= SIOCDEVPRIVATE &&
4838                     cmd <= SIOCDEVPRIVATE + 15) ||
4839                     cmd == SIOCBONDENSLAVE ||
4840                     cmd == SIOCBONDRELEASE ||
4841                     cmd == SIOCBONDSETHWADDR ||
4842                     cmd == SIOCBONDSLAVEINFOQUERY ||
4843                     cmd == SIOCBONDINFOQUERY ||
4844                     cmd == SIOCBONDCHANGEACTIVE ||
4845                     cmd == SIOCGMIIPHY ||
4846                     cmd == SIOCGMIIREG ||
4847                     cmd == SIOCSMIIREG ||
4848                     cmd == SIOCBRADDIF ||
4849                     cmd == SIOCBRDELIF ||
4850                     cmd == SIOCSHWTSTAMP ||
4851                     cmd == SIOCWANDEV) {
4852                         err = -EOPNOTSUPP;
4853                         if (ops->ndo_do_ioctl) {
4854                                 if (netif_device_present(dev))
4855                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4856                                 else
4857                                         err = -ENODEV;
4858                         }
4859                 } else
4860                         err = -EINVAL;
4861
4862         }
4863         return err;
4864 }
4865
4866 /*
4867  *      This function handles all "interface"-type I/O control requests. The actual
4868  *      'doing' part of this is dev_ifsioc above.
4869  */
4870
4871 /**
4872  *      dev_ioctl       -       network device ioctl
4873  *      @net: the applicable net namespace
4874  *      @cmd: command to issue
4875  *      @arg: pointer to a struct ifreq in user space
4876  *
4877  *      Issue ioctl functions to devices. This is normally called by the
4878  *      user space syscall interfaces but can sometimes be useful for
4879  *      other purposes. The return value is the return from the syscall if
4880  *      positive or a negative errno code on error.
4881  */
4882
4883 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4884 {
4885         struct ifreq ifr;
4886         int ret;
4887         char *colon;
4888
4889         /* One special case: SIOCGIFCONF takes ifconf argument
4890            and requires shared lock, because it sleeps writing
4891            to user space.
4892          */
4893
4894         if (cmd == SIOCGIFCONF) {
4895                 rtnl_lock();
4896                 ret = dev_ifconf(net, (char __user *) arg);
4897                 rtnl_unlock();
4898                 return ret;
4899         }
4900         if (cmd == SIOCGIFNAME)
4901                 return dev_ifname(net, (struct ifreq __user *)arg);
4902
4903         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4904                 return -EFAULT;
4905
4906         ifr.ifr_name[IFNAMSIZ-1] = 0;
4907
4908         colon = strchr(ifr.ifr_name, ':');
4909         if (colon)
4910                 *colon = 0;
4911
4912         /*
4913          *      See which interface the caller is talking about.
4914          */
4915
4916         switch (cmd) {
4917         /*
4918          *      These ioctl calls:
4919          *      - can be done by all.
4920          *      - atomic and do not require locking.
4921          *      - return a value
4922          */
4923         case SIOCGIFFLAGS:
4924         case SIOCGIFMETRIC:
4925         case SIOCGIFMTU:
4926         case SIOCGIFHWADDR:
4927         case SIOCGIFSLAVE:
4928         case SIOCGIFMAP:
4929         case SIOCGIFINDEX:
4930         case SIOCGIFTXQLEN:
4931                 dev_load(net, ifr.ifr_name);
4932                 rcu_read_lock();
4933                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4934                 rcu_read_unlock();
4935                 if (!ret) {
4936                         if (colon)
4937                                 *colon = ':';
4938                         if (copy_to_user(arg, &ifr,
4939                                          sizeof(struct ifreq)))
4940                                 ret = -EFAULT;
4941                 }
4942                 return ret;
4943
4944         case SIOCETHTOOL:
4945                 dev_load(net, ifr.ifr_name);
4946                 rtnl_lock();
4947                 ret = dev_ethtool(net, &ifr);
4948                 rtnl_unlock();
4949                 if (!ret) {
4950                         if (colon)
4951                                 *colon = ':';
4952                         if (copy_to_user(arg, &ifr,
4953                                          sizeof(struct ifreq)))
4954                                 ret = -EFAULT;
4955                 }
4956                 return ret;
4957
4958         /*
4959          *      These ioctl calls:
4960          *      - require superuser power.
4961          *      - require strict serialization.
4962          *      - return a value
4963          */
4964         case SIOCGMIIPHY:
4965         case SIOCGMIIREG:
4966         case SIOCSIFNAME:
4967                 if (!capable(CAP_NET_ADMIN))
4968                         return -EPERM;
4969                 dev_load(net, ifr.ifr_name);
4970                 rtnl_lock();
4971                 ret = dev_ifsioc(net, &ifr, cmd);
4972                 rtnl_unlock();
4973                 if (!ret) {
4974                         if (colon)
4975                                 *colon = ':';
4976                         if (copy_to_user(arg, &ifr,
4977                                          sizeof(struct ifreq)))
4978                                 ret = -EFAULT;
4979                 }
4980                 return ret;
4981
4982         /*
4983          *      These ioctl calls:
4984          *      - require superuser power.
4985          *      - require strict serialization.
4986          *      - do not return a value
4987          */
4988         case SIOCSIFFLAGS:
4989         case SIOCSIFMETRIC:
4990         case SIOCSIFMTU:
4991         case SIOCSIFMAP:
4992         case SIOCSIFHWADDR:
4993         case SIOCSIFSLAVE:
4994         case SIOCADDMULTI:
4995         case SIOCDELMULTI:
4996         case SIOCSIFHWBROADCAST:
4997         case SIOCSIFTXQLEN:
4998         case SIOCSMIIREG:
4999         case SIOCBONDENSLAVE:
5000         case SIOCBONDRELEASE:
5001         case SIOCBONDSETHWADDR:
5002         case SIOCBONDCHANGEACTIVE:
5003         case SIOCBRADDIF:
5004         case SIOCBRDELIF:
5005         case SIOCSHWTSTAMP:
5006                 if (!capable(CAP_NET_ADMIN))
5007                         return -EPERM;
5008                 /* fall through */
5009         case SIOCBONDSLAVEINFOQUERY:
5010         case SIOCBONDINFOQUERY:
5011                 dev_load(net, ifr.ifr_name);
5012                 rtnl_lock();
5013                 ret = dev_ifsioc(net, &ifr, cmd);
5014                 rtnl_unlock();
5015                 return ret;
5016
5017         case SIOCGIFMEM:
5018                 /* Get the per device memory space. We can add this but
5019                  * currently do not support it */
5020         case SIOCSIFMEM:
5021                 /* Set the per device memory buffer space.
5022                  * Not applicable in our case */
5023         case SIOCSIFLINK:
5024                 return -EINVAL;
5025
5026         /*
5027          *      Unknown or private ioctl.
5028          */
5029         default:
5030                 if (cmd == SIOCWANDEV ||
5031                     (cmd >= SIOCDEVPRIVATE &&
5032                      cmd <= SIOCDEVPRIVATE + 15)) {
5033                         dev_load(net, ifr.ifr_name);
5034                         rtnl_lock();
5035                         ret = dev_ifsioc(net, &ifr, cmd);
5036                         rtnl_unlock();
5037                         if (!ret && copy_to_user(arg, &ifr,
5038                                                  sizeof(struct ifreq)))
5039                                 ret = -EFAULT;
5040                         return ret;
5041                 }
5042                 /* Take care of Wireless Extensions */
5043                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5044                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5045                 return -EINVAL;
5046         }
5047 }
5048
5049
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates come from a function-local static counter that is
 *	reset to 1 whenever it is no longer positive; the first candidate
 *	for which __dev_get_by_index() finds no device in @net is returned.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5068
5069 /* Delayed registration/unregisteration */
5070 static LIST_HEAD(net_todo_list);
5071
5072 static void net_set_todo(struct net_device *dev)
5073 {
5074         list_add_tail(&dev->todo_list, &net_todo_list);
5075 }
5076
5077 static void rollback_registered_many(struct list_head *head)
5078 {
5079         struct net_device *dev, *tmp;
5080
5081         BUG_ON(dev_boot_phase);
5082         ASSERT_RTNL();
5083
5084         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5085                 /* Some devices call without registering
5086                  * for initialization unwind. Remove those
5087                  * devices and proceed with the remaining.
5088                  */
5089                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5090                         pr_debug("unregister_netdevice: device %s/%p never "
5091                                  "was registered\n", dev->name, dev);
5092
5093                         WARN_ON(1);
5094                         list_del(&dev->unreg_list);
5095                         continue;
5096                 }
5097
5098                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5099         }
5100
5101         /* If device is running, close it first. */
5102         dev_close_many(head);
5103
5104         list_for_each_entry(dev, head, unreg_list) {
5105                 /* And unlink it from device chain. */
5106                 unlist_netdevice(dev);
5107
5108                 dev->reg_state = NETREG_UNREGISTERING;
5109         }
5110
5111         synchronize_net();
5112
5113         list_for_each_entry(dev, head, unreg_list) {
5114                 /* Shutdown queueing discipline. */
5115                 dev_shutdown(dev);
5116
5117
5118                 /* Notify protocols, that we are about to destroy
5119                    this device. They should clean all the things.
5120                 */
5121                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5122
5123                 if (!dev->rtnl_link_ops ||
5124                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5125                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5126
5127                 /*
5128                  *      Flush the unicast and multicast chains
5129                  */
5130                 dev_uc_flush(dev);
5131                 dev_mc_flush(dev);
5132
5133                 if (dev->netdev_ops->ndo_uninit)
5134                         dev->netdev_ops->ndo_uninit(dev);
5135
5136                 /* Notifier chain MUST detach us from master device. */
5137                 WARN_ON(dev->master);
5138
5139                 /* Remove entries from kobject tree */
5140                 netdev_unregister_kobject(dev);
5141         }
5142
5143         /* Process any work delayed until the end of the batch */
5144         dev = list_first_entry(head, struct net_device, unreg_list);
5145         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5146
5147         rcu_barrier();
5148
5149         list_for_each_entry(dev, head, unreg_list)
5150                 dev_put(dev);
5151 }
5152
5153 static void rollback_registered(struct net_device *dev)
5154 {
5155         LIST_HEAD(single);
5156
5157         list_add(&dev->unreg_list, &single);
5158         rollback_registered_many(&single);
5159         list_del(&single);
5160 }
5161
5162 u32 netdev_fix_features(struct net_device *dev, u32 features)
5163 {
5164         /* Fix illegal checksum combinations */
5165         if ((features & NETIF_F_HW_CSUM) &&
5166             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5167                 netdev_info(dev, "mixed HW and IP checksum settings.\n");
5168                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5169         }
5170
5171         if ((features & NETIF_F_NO_CSUM) &&
5172             (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5173                 netdev_info(dev, "mixed no checksumming and other settings.\n");
5174                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5175         }
5176
5177         /* Fix illegal SG+CSUM combinations. */
5178         if ((features & NETIF_F_SG) &&
5179             !(features & NETIF_F_ALL_CSUM)) {
5180                 netdev_info(dev,
5181                             "Dropping NETIF_F_SG since no checksum feature.\n");
5182                 features &= ~NETIF_F_SG;
5183         }
5184
5185         /* TSO requires that SG is present as well. */
5186         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5187                 netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n");
5188                 features &= ~NETIF_F_TSO;
5189         }
5190
5191         /* Software GSO depends on SG. */
5192         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5193                 netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5194                 features &= ~NETIF_F_GSO;
5195         }
5196
5197         /* UFO needs SG and checksumming */
5198         if (features & NETIF_F_UFO) {
5199                 /* maybe split UFO into V4 and V6? */
5200                 if (!((features & NETIF_F_GEN_CSUM) ||
5201                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5202                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5203                         netdev_info(dev,
5204                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5205                         features &= ~NETIF_F_UFO;
5206                 }
5207
5208                 if (!(features & NETIF_F_SG)) {
5209                         netdev_info(dev,
5210                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5211                         features &= ~NETIF_F_UFO;
5212                 }
5213         }
5214
5215         return features;
5216 }
5217 EXPORT_SYMBOL(netdev_fix_features);
5218
5219 void netdev_update_features(struct net_device *dev)
5220 {
5221         u32 features;
5222         int err = 0;
5223
5224         features = netdev_get_wanted_features(dev);
5225
5226         if (dev->netdev_ops->ndo_fix_features)
5227                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5228
5229         /* driver might be less strict about feature dependencies */
5230         features = netdev_fix_features(dev, features);
5231
5232         if (dev->features == features)
5233                 return;
5234
5235         netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n",
5236                 dev->features, features);
5237
5238         if (dev->netdev_ops->ndo_set_features)
5239                 err = dev->netdev_ops->ndo_set_features(dev, features);
5240
5241         if (!err)
5242                 dev->features = features;
5243         else if (err < 0)
5244                 netdev_err(dev,
5245                         "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5246                         err, features, dev->features);
5247 }
5248 EXPORT_SYMBOL(netdev_update_features);
5249
5250 /**
5251  *      netif_stacked_transfer_operstate -      transfer operstate
5252  *      @rootdev: the root or lower level device to transfer state from
5253  *      @dev: the device to transfer operstate to
5254  *
5255  *      Transfer operational state from root to device. This is normally
5256  *      called when a stacking relationship exists between the root
5257  *      device and the device(a leaf device).
5258  */
5259 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5260                                         struct net_device *dev)
5261 {
5262         if (rootdev->operstate == IF_OPER_DORMANT)
5263                 netif_dormant_on(dev);
5264         else
5265                 netif_dormant_off(dev);
5266
5267         if (netif_carrier_ok(rootdev)) {
5268                 if (!netif_carrier_ok(dev))
5269                         netif_carrier_on(dev);
5270         } else {
5271                 if (netif_carrier_ok(dev))
5272                         netif_carrier_off(dev);
5273         }
5274 }
5275 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5276
5277 #ifdef CONFIG_RPS
5278 static int netif_alloc_rx_queues(struct net_device *dev)
5279 {
5280         unsigned int i, count = dev->num_rx_queues;
5281         struct netdev_rx_queue *rx;
5282
5283         BUG_ON(count < 1);
5284
5285         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5286         if (!rx) {
5287                 pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5288                 return -ENOMEM;
5289         }
5290         dev->_rx = rx;
5291
5292         for (i = 0; i < count; i++)
5293                 rx[i].dev = dev;
5294         return 0;
5295 }
5296 #endif
5297
5298 static void netdev_init_one_queue(struct net_device *dev,
5299                                   struct netdev_queue *queue, void *_unused)
5300 {
5301         /* Initialize queue lock */
5302         spin_lock_init(&queue->_xmit_lock);
5303         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5304         queue->xmit_lock_owner = -1;
5305         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5306         queue->dev = dev;
5307 }
5308
5309 static int netif_alloc_netdev_queues(struct net_device *dev)
5310 {
5311         unsigned int count = dev->num_tx_queues;
5312         struct netdev_queue *tx;
5313
5314         BUG_ON(count < 1);
5315
5316         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5317         if (!tx) {
5318                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5319                        count);
5320                 return -ENOMEM;
5321         }
5322         dev->_tx = tx;
5323
5324         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5325         spin_lock_init(&dev->tx_global_lock);
5326
5327         return 0;
5328 }
5329
5330 /**
5331  *      register_netdevice      - register a network device
5332  *      @dev: device to register
5333  *
5334  *      Take a completed network device structure and add it to the kernel
5335  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5336  *      chain. 0 is returned on success. A negative errno code is returned
5337  *      on a failure to set up the device, or if the name is a duplicate.
5338  *
5339  *      Callers must hold the rtnl semaphore. You may want
5340  *      register_netdev() instead of this.
5341  *
5342  *      BUGS:
5343  *      The locking appears insufficient to guarantee two parallel registers
5344  *      will not get the same name.
5345  */
5346
5347 int register_netdevice(struct net_device *dev)
5348 {
5349         int ret;
5350         struct net *net = dev_net(dev);
5351
5352         BUG_ON(dev_boot_phase);
5353         ASSERT_RTNL();
5354
5355         might_sleep();
5356
5357         /* When net_device's are persistent, this will be fatal. */
5358         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5359         BUG_ON(!net);
5360
5361         spin_lock_init(&dev->addr_list_lock);
5362         netdev_set_addr_lockdep_class(dev);
5363
5364         dev->iflink = -1;
5365
5366         /* Init, if this function is available */
5367         if (dev->netdev_ops->ndo_init) {
5368                 ret = dev->netdev_ops->ndo_init(dev);
5369                 if (ret) {
5370                         if (ret > 0)
5371                                 ret = -EIO;
5372                         goto out;
5373                 }
5374         }
5375
5376         ret = dev_get_valid_name(dev, dev->name, 0);
5377         if (ret)
5378                 goto err_uninit;
5379
5380         dev->ifindex = dev_new_index(net);
5381         if (dev->iflink == -1)
5382                 dev->iflink = dev->ifindex;
5383
5384         /* Transfer changeable features to wanted_features and enable
5385          * software offloads (GSO and GRO).
5386          */
5387         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5388         dev->features |= NETIF_F_SOFT_FEATURES;
5389         dev->wanted_features = dev->features & dev->hw_features;
5390
5391         /* Avoid warning from netdev_fix_features() for GSO without SG */
5392         if (!(dev->wanted_features & NETIF_F_SG)) {
5393                 dev->wanted_features &= ~NETIF_F_GSO;
5394                 dev->features &= ~NETIF_F_GSO;
5395         }
5396
5397         /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5398          * vlan_dev_init() will do the dev->features check, so these features
5399          * are enabled only if supported by underlying device.
5400          */
5401         dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5402
5403         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5404         ret = notifier_to_errno(ret);
5405         if (ret)
5406                 goto err_uninit;
5407
5408         ret = netdev_register_kobject(dev);
5409         if (ret)
5410                 goto err_uninit;
5411         dev->reg_state = NETREG_REGISTERED;
5412
5413         netdev_update_features(dev);
5414
5415         /*
5416          *      Default initial state at registry is that the
5417          *      device is present.
5418          */
5419
5420         set_bit(__LINK_STATE_PRESENT, &dev->state);
5421
5422         dev_init_scheduler(dev);
5423         dev_hold(dev);
5424         list_netdevice(dev);
5425
5426         /* Notify protocols, that a new device appeared. */
5427         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5428         ret = notifier_to_errno(ret);
5429         if (ret) {
5430                 rollback_registered(dev);
5431                 dev->reg_state = NETREG_UNREGISTERED;
5432         }
5433         /*
5434          *      Prevent userspace races by waiting until the network
5435          *      device is fully setup before sending notifications.
5436          */
5437         if (!dev->rtnl_link_ops ||
5438             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5439                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5440
5441 out:
5442         return ret;
5443
5444 err_uninit:
5445         if (dev->netdev_ops->ndo_uninit)
5446                 dev->netdev_ops->ndo_uninit(dev);
5447         goto out;
5448 }
5449 EXPORT_SYMBOL(register_netdevice);
5450
5451 /**
5452  *      init_dummy_netdev       - init a dummy network device for NAPI
5453  *      @dev: device to init
5454  *
5455  *      This takes a network device structure and initialize the minimum
5456  *      amount of fields so it can be used to schedule NAPI polls without
5457  *      registering a full blown interface. This is to be used by drivers
5458  *      that need to tie several hardware interfaces to a single NAPI
5459  *      poll scheduler due to HW limitations.
5460  */
5461 int init_dummy_netdev(struct net_device *dev)
5462 {
5463         /* Clear everything. Note we don't initialize spinlocks
5464          * are they aren't supposed to be taken by any of the
5465          * NAPI code and this dummy netdev is supposed to be
5466          * only ever used for NAPI polls
5467          */
5468         memset(dev, 0, sizeof(struct net_device));
5469
5470         /* make sure we BUG if trying to hit standard
5471          * register/unregister code path
5472          */
5473         dev->reg_state = NETREG_DUMMY;
5474
5475         /* NAPI wants this */
5476         INIT_LIST_HEAD(&dev->napi_list);
5477
5478         /* a dummy interface is started by default */
5479         set_bit(__LINK_STATE_PRESENT, &dev->state);
5480         set_bit(__LINK_STATE_START, &dev->state);
5481
5482         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5483          * because users of this 'device' dont need to change
5484          * its refcount.
5485          */
5486
5487         return 0;
5488 }
5489 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5490
5491
5492 /**
5493  *      register_netdev - register a network device
5494  *      @dev: device to register
5495  *
5496  *      Take a completed network device structure and add it to the kernel
5497  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5498  *      chain. 0 is returned on success. A negative errno code is returned
5499  *      on a failure to set up the device, or if the name is a duplicate.
5500  *
5501  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5502  *      and expands the device name if you passed a format string to
5503  *      alloc_netdev.
5504  */
5505 int register_netdev(struct net_device *dev)
5506 {
5507         int err;
5508
5509         rtnl_lock();
5510
5511         /*
5512          * If the name is a format string the caller wants us to do a
5513          * name allocation.
5514          */
5515         if (strchr(dev->name, '%')) {
5516                 err = dev_alloc_name(dev, dev->name);
5517                 if (err < 0)
5518                         goto out;
5519         }
5520
5521         err = register_netdevice(dev);
5522 out:
5523         rtnl_unlock();
5524         return err;
5525 }
5526 EXPORT_SYMBOL(register_netdev);
5527
5528 int netdev_refcnt_read(const struct net_device *dev)
5529 {
5530         int i, refcnt = 0;
5531
5532         for_each_possible_cpu(i)
5533                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5534         return refcnt;
5535 }
5536 EXPORT_SYMBOL(netdev_refcnt_read);
5537
5538 /*
5539  * netdev_wait_allrefs - wait until all references are gone.
5540  *
5541  * This is called when unregistering network devices.
5542  *
5543  * Any protocol or device that holds a reference should register
5544  * for netdevice notification, and cleanup and put back the
5545  * reference if they receive an UNREGISTER event.
5546  * We can get stuck here if buggy protocols don't correctly
5547  * call dev_put.
5548  */
5549 static void netdev_wait_allrefs(struct net_device *dev)
5550 {
5551         unsigned long rebroadcast_time, warning_time;
5552         int refcnt;
5553
5554         linkwatch_forget_dev(dev);
5555
5556         rebroadcast_time = warning_time = jiffies;
5557         refcnt = netdev_refcnt_read(dev);
5558
5559         while (refcnt != 0) {
5560                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5561                         rtnl_lock();
5562
5563                         /* Rebroadcast unregister notification */
5564                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5565                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5566                          * should have already handle it the first time */
5567
5568                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5569                                      &dev->state)) {
5570                                 /* We must not have linkwatch events
5571                                  * pending on unregister. If this
5572                                  * happens, we simply run the queue
5573                                  * unscheduled, resulting in a noop
5574                                  * for this device.
5575                                  */
5576                                 linkwatch_run_queue();
5577                         }
5578
5579                         __rtnl_unlock();
5580
5581                         rebroadcast_time = jiffies;
5582                 }
5583
5584                 msleep(250);
5585
5586                 refcnt = netdev_refcnt_read(dev);
5587
5588                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5589                         printk(KERN_EMERG "unregister_netdevice: "
5590                                "waiting for %s to become free. Usage "
5591                                "count = %d\n",
5592                                dev->name, refcnt);
5593                         warning_time = jiffies;
5594                 }
5595         }
5596 }
5597
5598 /* The sequence is:
5599  *
5600  *      rtnl_lock();
5601  *      ...
5602  *      register_netdevice(x1);
5603  *      register_netdevice(x2);
5604  *      ...
5605  *      unregister_netdevice(y1);
5606  *      unregister_netdevice(y2);
5607  *      ...
5608  *      rtnl_unlock();
5609  *      free_netdev(y1);
5610  *      free_netdev(y2);
5611  *
5612  * We are invoked by rtnl_unlock().
5613  * This allows us to deal with problems:
5614  * 1) We can delete sysfs objects which invoke hotplug
5615  *    without deadlocking with linkwatch via keventd.
5616  * 2) Since we run with the RTNL semaphore not held, we can sleep
5617  *    safely in order to wait for the netdev refcnt to drop to zero.
5618  *
5619  * We must not return until all unregister events added during
5620  * the interval the lock was held have been completed.
5621  */
5622 void netdev_run_todo(void)
5623 {
5624         struct list_head list;
5625
5626         /* Snapshot list, allow later requests */
5627         list_replace_init(&net_todo_list, &list);
5628
5629         __rtnl_unlock();
5630
5631         while (!list_empty(&list)) {
5632                 struct net_device *dev
5633                         = list_first_entry(&list, struct net_device, todo_list);
5634                 list_del(&dev->todo_list);
5635
5636                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5637                         printk(KERN_ERR "network todo '%s' but state %d\n",
5638                                dev->name, dev->reg_state);
5639                         dump_stack();
5640                         continue;
5641                 }
5642
5643                 dev->reg_state = NETREG_UNREGISTERED;
5644
5645                 on_each_cpu(flush_backlog, dev, 1);
5646
5647                 netdev_wait_allrefs(dev);
5648
5649                 /* paranoia */
5650                 BUG_ON(netdev_refcnt_read(dev));
5651                 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5652                 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5653                 WARN_ON(dev->dn_ptr);
5654
5655                 if (dev->destructor)
5656                         dev->destructor(dev);
5657
5658                 /* Free network device */
5659                 kobject_put(&dev->dev.kobj);
5660         }
5661 }
5662
5663 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5664  * fields in the same order, with only the type differing.
5665  */
5666 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5667                                     const struct net_device_stats *netdev_stats)
5668 {
5669 #if BITS_PER_LONG == 64
5670         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5671         memcpy(stats64, netdev_stats, sizeof(*stats64));
5672 #else
5673         size_t i, n = sizeof(*stats64) / sizeof(u64);
5674         const unsigned long *src = (const unsigned long *)netdev_stats;
5675         u64 *dst = (u64 *)stats64;
5676
5677         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5678                      sizeof(*stats64) / sizeof(u64));
5679         for (i = 0; i < n; i++)
5680                 dst[i] = src[i];
5681 #endif
5682 }
5683
5684 /**
5685  *      dev_get_stats   - get network device statistics
5686  *      @dev: device to get statistics from
5687  *      @storage: place to store stats
5688  *
5689  *      Get network statistics from device. Return @storage.
5690  *      The device driver may provide its own method by setting
5691  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5692  *      otherwise the internal statistics structure is used.
5693  */
5694 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5695                                         struct rtnl_link_stats64 *storage)
5696 {
5697         const struct net_device_ops *ops = dev->netdev_ops;
5698
5699         if (ops->ndo_get_stats64) {
5700                 memset(storage, 0, sizeof(*storage));
5701                 ops->ndo_get_stats64(dev, storage);
5702         } else if (ops->ndo_get_stats) {
5703                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5704         } else {
5705                 netdev_stats_to_stats64(storage, &dev->stats);
5706         }
5707         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5708         return storage;
5709 }
5710 EXPORT_SYMBOL(dev_get_stats);
5711
/*
 * Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised with the noop qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current value is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			/* publish only once the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
5729
5730 /**
5731  *      alloc_netdev_mqs - allocate network device
5732  *      @sizeof_priv:   size of private data to allocate space for
5733  *      @name:          device name format string
5734  *      @setup:         callback to initialize device
5735  *      @txqs:          the number of TX subqueues to allocate
5736  *      @rxqs:          the number of RX subqueues to allocate
5737  *
5738  *      Allocates a struct net_device with private data area for driver use
5739  *      and performs basic initialization.  Also allocates subquue structs
5740  *      for each queue on the device.
5741  */
5742 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5743                 void (*setup)(struct net_device *),
5744                 unsigned int txqs, unsigned int rxqs)
5745 {
5746         struct net_device *dev;
5747         size_t alloc_size;
5748         struct net_device *p;
5749
5750         BUG_ON(strlen(name) >= sizeof(dev->name));
5751
5752         if (txqs < 1) {
5753                 pr_err("alloc_netdev: Unable to allocate device "
5754                        "with zero queues.\n");
5755                 return NULL;
5756         }
5757
5758 #ifdef CONFIG_RPS
5759         if (rxqs < 1) {
5760                 pr_err("alloc_netdev: Unable to allocate device "
5761                        "with zero RX queues.\n");
5762                 return NULL;
5763         }
5764 #endif
5765
5766         alloc_size = sizeof(struct net_device);
5767         if (sizeof_priv) {
5768                 /* ensure 32-byte alignment of private area */
5769                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5770                 alloc_size += sizeof_priv;
5771         }
5772         /* ensure 32-byte alignment of whole construct */
5773         alloc_size += NETDEV_ALIGN - 1;
5774
5775         p = kzalloc(alloc_size, GFP_KERNEL);
5776         if (!p) {
5777                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5778                 return NULL;
5779         }
5780
5781         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5782         dev->padded = (char *)dev - (char *)p;
5783
5784         dev->pcpu_refcnt = alloc_percpu(int);
5785         if (!dev->pcpu_refcnt)
5786                 goto free_p;
5787
5788         if (dev_addr_init(dev))
5789                 goto free_pcpu;
5790
5791         dev_mc_init(dev);
5792         dev_uc_init(dev);
5793
5794         dev_net_set(dev, &init_net);
5795
5796         dev->gso_max_size = GSO_MAX_SIZE;
5797
5798         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5799         dev->ethtool_ntuple_list.count = 0;
5800         INIT_LIST_HEAD(&dev->napi_list);
5801         INIT_LIST_HEAD(&dev->unreg_list);
5802         INIT_LIST_HEAD(&dev->link_watch_list);
5803         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5804         setup(dev);
5805
5806         dev->num_tx_queues = txqs;
5807         dev->real_num_tx_queues = txqs;
5808         if (netif_alloc_netdev_queues(dev))
5809                 goto free_all;
5810
5811 #ifdef CONFIG_RPS
5812         dev->num_rx_queues = rxqs;
5813         dev->real_num_rx_queues = rxqs;
5814         if (netif_alloc_rx_queues(dev))
5815                 goto free_all;
5816 #endif
5817
5818         strcpy(dev->name, name);
5819         dev->group = INIT_NETDEV_GROUP;
5820         return dev;
5821
5822 free_all:
5823         free_netdev(dev);
5824         return NULL;
5825
5826 free_pcpu:
5827         free_percpu(dev->pcpu_refcnt);
5828         kfree(dev->_tx);
5829 #ifdef CONFIG_RPS
5830         kfree(dev->_rx);
5831 #endif
5832
5833 free_p:
5834         kfree(p);
5835         return NULL;
5836 }
5837 EXPORT_SYMBOL(alloc_netdev_mqs);
5838
5839 /**
5840  *      free_netdev - free network device
5841  *      @dev: device
5842  *
5843  *      This function does the last stage of destroying an allocated device
5844  *      interface. The reference to the device object is released.
5845  *      If this is the last reference then it will be freed.
5846  */
5847 void free_netdev(struct net_device *dev)
5848 {
5849         struct napi_struct *p, *n;
5850
5851         release_net(dev_net(dev));
5852
5853         kfree(dev->_tx);
5854 #ifdef CONFIG_RPS
5855         kfree(dev->_rx);
5856 #endif
5857
5858         kfree(rcu_dereference_raw(dev->ingress_queue));
5859
5860         /* Flush device addresses */
5861         dev_addr_flush(dev);
5862
5863         /* Clear ethtool n-tuple list */
5864         ethtool_ntuple_flush(dev);
5865
5866         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5867                 netif_napi_del(p);
5868
5869         free_percpu(dev->pcpu_refcnt);
5870         dev->pcpu_refcnt = NULL;
5871
5872         /*  Compatibility with error handling in drivers */
5873         if (dev->reg_state == NETREG_UNINITIALIZED) {
5874                 kfree((char *)dev - dev->padded);
5875                 return;
5876         }
5877
5878         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5879         dev->reg_state = NETREG_RELEASED;
5880
5881         /* will free via device release */
5882         put_device(&dev->dev);
5883 }
5884 EXPORT_SYMBOL(free_netdev);
5885
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits until packets that are being received right now have been
 *	dealt with. Packets that start arriving later are not held back.
 *	May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	/* implemented as a wait for an RCU grace period */
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5898
5899 /**
5900  *      unregister_netdevice_queue - remove device from the kernel
5901  *      @dev: device
5902  *      @head: list
5903  *
5904  *      This function shuts down a device interface and removes it
5905  *      from the kernel tables.
5906  *      If head not NULL, device is queued to be unregistered later.
5907  *
5908  *      Callers must hold the rtnl semaphore.  You may want
5909  *      unregister_netdev() instead of this.
5910  */
5911
5912 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5913 {
5914         ASSERT_RTNL();
5915
5916         if (head) {
5917                 list_move_tail(&dev->unreg_list, head);
5918         } else {
5919                 rollback_registered(dev);
5920                 /* Finish processing unregister after unlock */
5921                 net_set_todo(dev);
5922         }
5923 }
5924 EXPORT_SYMBOL(unregister_netdevice_queue);
5925
5926 /**
5927  *      unregister_netdevice_many - unregister many devices
5928  *      @head: list of devices
5929  */
5930 void unregister_netdevice_many(struct list_head *head)
5931 {
5932         struct net_device *dev;
5933
5934         if (!list_empty(head)) {
5935                 rollback_registered_many(head);
5936                 list_for_each_entry(dev, head, unreg_list)
5937                         net_set_todo(dev);
5938         }
5939 }
5940 EXPORT_SYMBOL(unregister_netdevice_many);
5941
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	This is unregister_netdevice() wrapped in the rtnl semaphore. In
 *	general this is the variant to use, rather than
 *	unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5960
5961 /**
5962  *      dev_change_net_namespace - move device to different nethost namespace
5963  *      @dev: device
5964  *      @net: network namespace
5965  *      @pat: If not NULL name pattern to try if the current device name
5966  *            is already taken in the destination network namespace.
5967  *
5968  *      This function shuts down a device interface and moves it
5969  *      to a new network namespace. On success 0 is returned, on
5970  *      a failure a netagive errno code is returned.
5971  *
5972  *      Callers must hold the rtnl semaphore.
5973  */
5974
5975 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5976 {
5977         int err;
5978
5979         ASSERT_RTNL();
5980
5981         /* Don't allow namespace local devices to be moved. */
5982         err = -EINVAL;
5983         if (dev->features & NETIF_F_NETNS_LOCAL)
5984                 goto out;
5985
5986         /* Ensure the device has been registrered */
5987         err = -EINVAL;
5988         if (dev->reg_state != NETREG_REGISTERED)
5989                 goto out;
5990
5991         /* Get out if there is nothing todo */
5992         err = 0;
5993         if (net_eq(dev_net(dev), net))
5994                 goto out;
5995
5996         /* Pick the destination device name, and ensure
5997          * we can use it in the destination network namespace.
5998          */
5999         err = -EEXIST;
6000         if (__dev_get_by_name(net, dev->name)) {
6001                 /* We get here if we can't use the current device name */
6002                 if (!pat)
6003                         goto out;
6004                 if (dev_get_valid_name(dev, pat, 1))
6005                         goto out;
6006         }
6007
6008         /*
6009          * And now a mini version of register_netdevice unregister_netdevice.
6010          */
6011
6012         /* If device is running close it first. */
6013         dev_close(dev);
6014
6015         /* And unlink it from device chain */
6016         err = -ENODEV;
6017         unlist_netdevice(dev);
6018
6019         synchronize_net();
6020
6021         /* Shutdown queueing discipline. */
6022         dev_shutdown(dev);
6023
6024         /* Notify protocols, that we are about to destroy
6025            this device. They should clean all the things.
6026
6027            Note that dev->reg_state stays at NETREG_REGISTERED.
6028            This is wanted because this way 8021q and macvlan know
6029            the device is just moving and can keep their slaves up.
6030         */
6031         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6032         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6033
6034         /*
6035          *      Flush the unicast and multicast chains
6036          */
6037         dev_uc_flush(dev);
6038         dev_mc_flush(dev);
6039
6040         /* Actually switch the network namespace */
6041         dev_net_set(dev, net);
6042
6043         /* If there is an ifindex conflict assign a new one */
6044         if (__dev_get_by_index(net, dev->ifindex)) {
6045                 int iflink = (dev->iflink == dev->ifindex);
6046                 dev->ifindex = dev_new_index(net);
6047                 if (iflink)
6048                         dev->iflink = dev->ifindex;
6049         }
6050
6051         /* Fixup kobjects */
6052         err = device_rename(&dev->dev, dev->name);
6053         WARN_ON(err);
6054
6055         /* Add the device back in the hashes */
6056         list_netdevice(dev);
6057
6058         /* Notify protocols, that a new device appeared. */
6059         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6060
6061         /*
6062          *      Prevent userspace races by waiting until the network
6063          *      device is fully setup before sending notifications.
6064          */
6065         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6066
6067         synchronize_net();
6068         err = 0;
6069 out:
6070         return err;
6071 }
6072 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6073
6074 static int dev_cpu_callback(struct notifier_block *nfb,
6075                             unsigned long action,
6076                             void *ocpu)
6077 {
6078         struct sk_buff **list_skb;
6079         struct sk_buff *skb;
6080         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6081         struct softnet_data *sd, *oldsd;
6082
6083         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6084                 return NOTIFY_OK;
6085
6086         local_irq_disable();
6087         cpu = smp_processor_id();
6088         sd = &per_cpu(softnet_data, cpu);
6089         oldsd = &per_cpu(softnet_data, oldcpu);
6090
6091         /* Find end of our completion_queue. */
6092         list_skb = &sd->completion_queue;
6093         while (*list_skb)
6094                 list_skb = &(*list_skb)->next;
6095         /* Append completion queue from offline CPU. */
6096         *list_skb = oldsd->completion_queue;
6097         oldsd->completion_queue = NULL;
6098
6099         /* Append output queue from offline CPU. */
6100         if (oldsd->output_queue) {
6101                 *sd->output_queue_tailp = oldsd->output_queue;
6102                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6103                 oldsd->output_queue = NULL;
6104                 oldsd->output_queue_tailp = &oldsd->output_queue;
6105         }
6106
6107         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6108         local_irq_enable();
6109
6110         /* Process offline CPU's input_pkt_queue */
6111         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6112                 netif_rx(skb);
6113                 input_queue_head_incr(oldsd);
6114         }
6115         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6116                 netif_rx(skb);
6117                 input_queue_head_incr(oldsd);
6118         }
6119
6120         return NOTIFY_OK;
6121 }
6122
6123
6124 /**
6125  *      netdev_increment_features - increment feature set by one
6126  *      @all: current feature set
6127  *      @one: new feature set
6128  *      @mask: mask feature set
6129  *
6130  *      Computes a new feature set after adding a device with feature set
6131  *      @one to the master device with current feature set @all.  Will not
6132  *      enable anything that is off in @mask. Returns the new feature set.
6133  */
6134 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6135 {
6136         /* If device needs checksumming, downgrade to it. */
6137         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6138                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6139         else if (mask & NETIF_F_ALL_CSUM) {
6140                 /* If one device supports v4/v6 checksumming, set for all. */
6141                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6142                     !(all & NETIF_F_GEN_CSUM)) {
6143                         all &= ~NETIF_F_ALL_CSUM;
6144                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6145                 }
6146
6147                 /* If one device supports hw checksumming, set for all. */
6148                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6149                         all &= ~NETIF_F_ALL_CSUM;
6150                         all |= NETIF_F_HW_CSUM;
6151                 }
6152         }
6153
6154         one |= NETIF_F_ALL_CSUM;
6155
6156         one |= all & NETIF_F_ONE_FOR_ALL;
6157         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6158         all |= one & mask & NETIF_F_ONE_FOR_ALL;
6159
6160         return all;
6161 }
6162 EXPORT_SYMBOL(netdev_increment_features);
6163
6164 static struct hlist_head *netdev_create_hash(void)
6165 {
6166         int i;
6167         struct hlist_head *hash;
6168
6169         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6170         if (hash != NULL)
6171                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6172                         INIT_HLIST_HEAD(&hash[i]);
6173
6174         return hash;
6175 }
6176
6177 /* Initialize per network namespace state */
6178 static int __net_init netdev_init(struct net *net)
6179 {
6180         INIT_LIST_HEAD(&net->dev_base_head);
6181
6182         net->dev_name_head = netdev_create_hash();
6183         if (net->dev_name_head == NULL)
6184                 goto err_name;
6185
6186         net->dev_index_head = netdev_create_hash();
6187         if (net->dev_index_head == NULL)
6188                 goto err_idx;
6189
6190         return 0;
6191
6192 err_idx:
6193         kfree(net->dev_name_head);
6194 err_name:
6195         return -ENOMEM;
6196 }
6197
6198 /**
6199  *      netdev_drivername - network driver for the device
6200  *      @dev: network device
6201  *      @buffer: buffer for resulting name
6202  *      @len: size of buffer
6203  *
6204  *      Determine network driver for device.
6205  */
6206 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6207 {
6208         const struct device_driver *driver;
6209         const struct device *parent;
6210
6211         if (len <= 0 || !buffer)
6212                 return buffer;
6213         buffer[0] = 0;
6214
6215         parent = dev->dev.parent;
6216
6217         if (!parent)
6218                 return buffer;
6219
6220         driver = parent->driver;
6221         if (driver && driver->name)
6222                 strlcpy(buffer, driver->name, len);
6223         return buffer;
6224 }
6225
6226 static int __netdev_printk(const char *level, const struct net_device *dev,
6227                            struct va_format *vaf)
6228 {
6229         int r;
6230
6231         if (dev && dev->dev.parent)
6232                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6233                                netdev_name(dev), vaf);
6234         else if (dev)
6235                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6236         else
6237                 r = printk("%s(NULL net_device): %pV", level, vaf);
6238
6239         return r;
6240 }
6241
6242 int netdev_printk(const char *level, const struct net_device *dev,
6243                   const char *format, ...)
6244 {
6245         struct va_format vaf;
6246         va_list args;
6247         int r;
6248
6249         va_start(args, format);
6250
6251         vaf.fmt = format;
6252         vaf.va = &args;
6253
6254         r = __netdev_printk(level, dev, &vaf);
6255         va_end(args);
6256
6257         return r;
6258 }
6259 EXPORT_SYMBOL(netdev_printk);
6260
6261 #define define_netdev_printk_level(func, level)                 \
6262 int func(const struct net_device *dev, const char *fmt, ...)    \
6263 {                                                               \
6264         int r;                                                  \
6265         struct va_format vaf;                                   \
6266         va_list args;                                           \
6267                                                                 \
6268         va_start(args, fmt);                                    \
6269                                                                 \
6270         vaf.fmt = fmt;                                          \
6271         vaf.va = &args;                                         \
6272                                                                 \
6273         r = __netdev_printk(level, dev, &vaf);                  \
6274         va_end(args);                                           \
6275                                                                 \
6276         return r;                                               \
6277 }                                                               \
6278 EXPORT_SYMBOL(func);
6279
6280 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6281 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6282 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6283 define_netdev_printk_level(netdev_err, KERN_ERR);
6284 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6285 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6286 define_netdev_printk_level(netdev_info, KERN_INFO);
6287
6288 static void __net_exit netdev_exit(struct net *net)
6289 {
6290         kfree(net->dev_name_head);
6291         kfree(net->dev_index_head);
6292 }
6293
6294 static struct pernet_operations __net_initdata netdev_net_ops = {
6295         .init = netdev_init,
6296         .exit = netdev_exit,
6297 };
6298
6299 static void __net_exit default_device_exit(struct net *net)
6300 {
6301         struct net_device *dev, *aux;
6302         /*
6303          * Push all migratable network devices back to the
6304          * initial network namespace
6305          */
6306         rtnl_lock();
6307         for_each_netdev_safe(net, dev, aux) {
6308                 int err;
6309                 char fb_name[IFNAMSIZ];
6310
6311                 /* Ignore unmoveable devices (i.e. loopback) */
6312                 if (dev->features & NETIF_F_NETNS_LOCAL)
6313                         continue;
6314
6315                 /* Leave virtual devices for the generic cleanup */
6316                 if (dev->rtnl_link_ops)
6317                         continue;
6318
6319                 /* Push remaing network devices to init_net */
6320                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6321                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6322                 if (err) {
6323                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6324                                 __func__, dev->name, err);
6325                         BUG();
6326                 }
6327         }
6328         rtnl_unlock();
6329 }
6330
6331 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6332 {
6333         /* At exit all network devices most be removed from a network
6334          * namespace.  Do this in the reverse order of registration.
6335          * Do this across as many network namespaces as possible to
6336          * improve batching efficiency.
6337          */
6338         struct net_device *dev;
6339         struct net *net;
6340         LIST_HEAD(dev_kill_list);
6341
6342         rtnl_lock();
6343         list_for_each_entry(net, net_list, exit_list) {
6344                 for_each_netdev_reverse(net, dev) {
6345                         if (dev->rtnl_link_ops)
6346                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6347                         else
6348                                 unregister_netdevice_queue(dev, &dev_kill_list);
6349                 }
6350         }
6351         unregister_netdevice_many(&dev_kill_list);
6352         list_del(&dev_kill_list);
6353         rtnl_unlock();
6354 }
6355
6356 static struct pernet_operations __net_initdata default_device_ops = {
6357         .exit = default_device_exit,
6358         .exit_batch = default_device_exit_batch,
6359 };
6360
6361 /*
6362  *      Initialize the DEV module. At boot time this walks the device list and
6363  *      unhooks any devices that fail to initialise (normally hardware not
6364  *      present) and leaves us with a valid list of present and active devices.
6365  *
6366  */
6367
6368 /*
6369  *       This is called single threaded during boot, so no need
6370  *       to take the rtnl semaphore.
6371  */
6372 static int __init net_dev_init(void)
6373 {
6374         int i, rc = -ENOMEM;
6375
6376         BUG_ON(!dev_boot_phase);
6377
6378         if (dev_proc_init())
6379                 goto out;
6380
6381         if (netdev_kobject_init())
6382                 goto out;
6383
6384         INIT_LIST_HEAD(&ptype_all);
6385         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6386                 INIT_LIST_HEAD(&ptype_base[i]);
6387
6388         if (register_pernet_subsys(&netdev_net_ops))
6389                 goto out;
6390
6391         /*
6392          *      Initialise the packet receive queues.
6393          */
6394
6395         for_each_possible_cpu(i) {
6396                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6397
6398                 memset(sd, 0, sizeof(*sd));
6399                 skb_queue_head_init(&sd->input_pkt_queue);
6400                 skb_queue_head_init(&sd->process_queue);
6401                 sd->completion_queue = NULL;
6402                 INIT_LIST_HEAD(&sd->poll_list);
6403                 sd->output_queue = NULL;
6404                 sd->output_queue_tailp = &sd->output_queue;
6405 #ifdef CONFIG_RPS
6406                 sd->csd.func = rps_trigger_softirq;
6407                 sd->csd.info = sd;
6408                 sd->csd.flags = 0;
6409                 sd->cpu = i;
6410 #endif
6411
6412                 sd->backlog.poll = process_backlog;
6413                 sd->backlog.weight = weight_p;
6414                 sd->backlog.gro_list = NULL;
6415                 sd->backlog.gro_count = 0;
6416         }
6417
6418         dev_boot_phase = 0;
6419
6420         /* The loopback device is special if any other network devices
6421          * is present in a network namespace the loopback device must
6422          * be present. Since we now dynamically allocate and free the
6423          * loopback device ensure this invariant is maintained by
6424          * keeping the loopback device as the first device on the
6425          * list of network devices.  Ensuring the loopback devices
6426          * is the first device that appears and the last network device
6427          * that disappears.
6428          */
6429         if (register_pernet_device(&loopback_net_ops))
6430                 goto out;
6431
6432         if (register_pernet_device(&default_device_ops))
6433                 goto out;
6434
6435         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6436         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6437
6438         hotcpu_notifier(dev_cpu_callback, 0);
6439         dst_init();
6440         dev_mcast_init();
6441         rc = 0;
6442 out:
6443         return rc;
6444 }
6445
6446 subsys_initcall(net_dev_init);
6447
6448 static int __init initialize_hashrnd(void)
6449 {
6450         get_random_bytes(&hashrnd, sizeof(hashrnd));
6451         return 0;
6452 }
6453
6454 late_initcall_sync(initialize_hashrnd);
6455