net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135 #include <linux/cpu_rmap.h>
 136
 137 #include "net-sysfs.h"
 138
 139 /*
      * Presumably the cap on how many skbs GRO keeps pending at once (inferred
      * from the name only -- confirm against the GRO code).
      * Instead of increasing this, you should create a hash table.
      */
 140 #define MAX_GRO_SKBS 8
 141
 142 /*
      * MAX_HEADER plus an extra 128.
      * This should be increased if a protocol with a bigger head is added.
      */
 143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
 145 /*
 146  *      The list of packet types we will receive (as opposed to discard)
 147  *      and the routines to invoke.
 148  *
 149  *      Why 16. Because with 16 the only overlap we get on a hash of the
 150  *      low nibble of the protocol value is RARP/SNAP/X.25.
 151  *
 152  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 153  *             sure which should go first, but I bet it won't make much
 154  *             difference if we are running VLANs.  The good news is that
 155  *             this protocol won't be in the list unless compiled in, so
 156  *             the average user (w/out VLANs) will not be adversely affected.
 157  *             --BLG
 158  *
 159  *              0800    IP
 160  *              8100    802.1Q VLAN
 161  *              0001    802.3
 162  *              0002    AX.25
 163  *              0004    802.2
 164  *              8035    RARP
 165  *              0005    SNAP
 166  *              0805    X.25
 167  *              0806    ARP
 168  *              8137    IPX
 169  *              0009    Localtalk
 170  *              86DD    IPv6
 171  */
 172
 173 #define PTYPE_HASH_SIZE (16)    /* number of buckets in ptype_base[] */
 174 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)   /* ptype_head() masks the host-order protocol value with this */
 175
 176 static DEFINE_SPINLOCK(ptype_lock);     /* held by dev_add_pack()/__dev_remove_pack() around list updates */
 177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;      /* handlers for one specific protocol, hashed */
 178 static struct list_head ptype_all __read_mostly;        /* Taps: handlers registered for ETH_P_ALL */
 179
 180 /*
 181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182  * semaphore.
 183  *
 184  * Pure readers hold dev_base_lock for reading, or rcu_read_lock().
 185  *
 186  * Writers must hold the rtnl semaphore while they loop through the
 187  * dev_base_head list, and hold dev_base_lock for writing when they do the
 188  * actual updates.  This allows pure readers to access the list even
 189  * while a writer is preparing to update it.
 190  *
 191  * To put it another way, dev_base_lock is held for writing only to
 192  * protect against pure readers; the rtnl semaphore provides the
 193  * protection against other writers.
 194  *
 195  * For example usages, see register_netdevice() and
 196  * unregister_netdevice(), which must be called with the rtnl
 197  * semaphore held.
      *
      * In this file, list_netdevice() and unlist_netdevice() follow the rule:
      * both assert RTNL and take dev_base_lock for writing (with bottom
      * halves disabled) around the list and hash-chain updates.
 198  */
 199 DEFINE_RWLOCK(dev_base_lock);
 200 EXPORT_SYMBOL(dev_base_lock);
 201
 202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 203 {
 204         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 205         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 206 }
 207
 208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 209 {
 210         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 211 }
 212
 213 static inline void rps_lock(struct softnet_data *sd)
 214 {
 215 #ifdef CONFIG_RPS
 216         spin_lock(&sd->input_pkt_queue.lock);
 217 #endif
 218 }
 219
 220 static inline void rps_unlock(struct softnet_data *sd)
 221 {
 222 #ifdef CONFIG_RPS
 223         spin_unlock(&sd->input_pkt_queue.lock);
 224 #endif
 225 }
 226
 227 /* Device list insertion */
 228 static int list_netdevice(struct net_device *dev)
 229 {
 230         struct net *net = dev_net(dev);
 231
 232         ASSERT_RTNL();
 233
 234         write_lock_bh(&dev_base_lock);
 235         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 236         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 237         hlist_add_head_rcu(&dev->index_hlist,
 238                            dev_index_hash(net, dev->ifindex));
 239         write_unlock_bh(&dev_base_lock);
 240         return 0;
 241 }
 242
 243 /* Device list removal
 244  * caller must respect a RCU grace period before freeing/reusing dev
 245  */
 246 static void unlist_netdevice(struct net_device *dev)
 247 {
 248         ASSERT_RTNL();
 249
 250         /* Unlink dev from the device chain */
 251         write_lock_bh(&dev_base_lock);
 252         list_del_rcu(&dev->dev_list);
 253         hlist_del_rcu(&dev->name_hlist);
 254         hlist_del_rcu(&dev->index_hlist);
 255         write_unlock_bh(&dev_base_lock);
 256 }
 257
 258 /*
 259  *      Our notifier list: a raw notifier chain head, private to this file.
      *      (Presumably the chain run for netdevice events -- the registration
      *      and call sites are outside this part of the file; confirm there.)
 260  */
 261
 262 static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264 /*
 265  *      Device drivers call our routines to queue packets here. We empty the
 266  *      queue in the local softnet handler.
      *
      *      There is one struct softnet_data per CPU.  When CONFIG_RPS is set,
      *      rps_lock()/rps_unlock() take and release its input_pkt_queue.lock.
 267  */
 268
 269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 270 EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 272 #ifdef CONFIG_LOCKDEP
 273 /*
 274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 275  * according to dev->type
      *
      * netdev_lock_type[] and netdev_lock_name[] are parallel tables: the
      * position of a device type in the first one is the index of its lock
      * name in the second one and of its keys in the two key arrays below.
      * Keep both tables in the same order and of the same length.
      * The last entry (ARPHRD_NONE) doubles as the default that
      * netdev_lock_pos() returns for device types not listed here.
 276  */
 277 static const unsigned short netdev_lock_type[] =
 278         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 279          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 280          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 281          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 282          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 283          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 284          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 285          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 286          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 287          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 288          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 289          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 290          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 291          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 292          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 293          ARPHRD_VOID, ARPHRD_NONE};
 294
     /* Lock names, one per entry of netdev_lock_type[], in the same order. */
 295 static const char *const netdev_lock_name[] =
 296         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 297          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 298          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 299          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 300          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 301          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 302          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 303          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 304          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 305          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 306          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 307          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 308          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 309          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 310          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 311          "_xmit_VOID", "_xmit_NONE"};
 312
     /* One lockdep key per table entry, separately for xmit and address-list locks. */
 313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 315
 316 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 317 {
 318         int i;
 319
 320         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 321                 if (netdev_lock_type[i] == dev_type)
 322                         return i;
 323         /* the last key is used by default */
 324         return ARRAY_SIZE(netdev_lock_type) - 1;
 325 }
 326
 327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 328                                                  unsigned short dev_type)
 329 {
 330         int i;
 331
 332         i = netdev_lock_pos(dev_type);
 333         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 334                                    netdev_lock_name[i]);
 335 }
 336
 337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 338 {
 339         int i;
 340
 341         i = netdev_lock_pos(dev->type);
 342         lockdep_set_class_and_name(&dev->addr_list_lock,
 343                                    &netdev_addr_lock_key[i],
 344                                    netdev_lock_name[i]);
 345 }
 346 #else /* !CONFIG_LOCKDEP */
     /* Without CONFIG_LOCKDEP there is no class to assign: both helpers are empty. */
 347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 348                                                  unsigned short dev_type)
 349 {
 350 }
 351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 352 {
 353 }
 354 #endif /* CONFIG_LOCKDEP */
 355
 356 /*******************************************************************************
 357
 358                 Protocol management and registration routines
 359
 360 *******************************************************************************/
 361
 362 /*
 363  *      Add a protocol ID to the list. Now that the input handler is
 364  *      smarter we can dispense with all the messy stuff that used to be
 365  *      here.
 366  *
 367  *      BEWARE!!! Protocol handlers, mangling input packets,
 368  *      MUST BE last in hash buckets and checking protocol handlers
 369  *      MUST start from promiscuous ptype_all chain in net_bh.
 370  *      It is true now, do not change it.
 371  *      Explanation follows: if protocol handler, mangling packet, will
 372  *      be the first on list, it is not able to sense, that packet
 373  *      is cloned and should be copied-on-write, so that it will
 374  *      change it and subsequent readers will get broken packet.
 375  *                                                      --ANK (980803)
 376  */
 377
 378 static inline struct list_head *ptype_head(const struct packet_type *pt)
 379 {
 380         if (pt->type == htons(ETH_P_ALL))
 381                 return &ptype_all;
 382         else
 383                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 384 }
 385
 386 /**
 387  *      dev_add_pack - add packet handler
 388  *      @pt: packet type declaration
 389  *
 390  *      Add a protocol handler to the networking stack. The passed &packet_type
 391  *      is linked into kernel lists and may not be freed until it has been
 392  *      removed from the kernel lists.
 393  *
 394  *      This call does not sleep therefore it can not
 395  *      guarantee all CPU's that are in middle of receiving packets
 396  *      will see the new packet type (until the next received packet).
 397  */
 398
 399 void dev_add_pack(struct packet_type *pt)
 400 {
 401         struct list_head *head = ptype_head(pt);
 402
 403         spin_lock(&ptype_lock);
 404         list_add_rcu(&pt->list, head);
 405         spin_unlock(&ptype_lock);
 406 }
 407 EXPORT_SYMBOL(dev_add_pack);
 408
 409 /**
 410  *      __dev_remove_pack        - remove packet handler
 411  *      @pt: packet type declaration
 412  *
 413  *      Remove a protocol handler that was previously added to the kernel
 414  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 415  *      from the kernel lists and can be freed or reused once this function
 416  *      returns.
 417  *
 418  *      The packet type might still be in use by receivers
 419  *      and must not be freed until after all the CPU's have gone
 420  *      through a quiescent state.
 421  */
 422 void __dev_remove_pack(struct packet_type *pt)
 423 {
 424         struct list_head *head = ptype_head(pt);
 425         struct packet_type *pt1;
 426
 427         spin_lock(&ptype_lock);
 428
 429         list_for_each_entry(pt1, head, list) {
 430                 if (pt == pt1) {
 431                         list_del_rcu(&pt->list);
 432                         goto out;
 433                 }
 434         }
 435
 436         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 437 out:
 438         spin_unlock(&ptype_lock);
 439 }
 440 EXPORT_SYMBOL(__dev_remove_pack);
 441
 442 /**
 443  *      dev_remove_pack  - remove packet handler
 444  *      @pt: packet type declaration
 445  *
 446  *      Remove a protocol handler that was previously added to the kernel
 447  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 448  *      from the kernel lists and can be freed or reused once this function
 449  *      returns.
 450  *
 451  *      This call sleeps to guarantee that no CPU is looking at the packet
 452  *      type after return.
 453  */
 454 void dev_remove_pack(struct packet_type *pt)
 455 {
 456         __dev_remove_pack(pt);
 457
 458         synchronize_net();
 459 }
 460 EXPORT_SYMBOL(dev_remove_pack);
 461
 462 /******************************************************************************
 463
 464                       Device Boot-time Settings Routines
 465
 466 *******************************************************************************/
 467
 468 /*
      * Boot time configuration table.
      * A slot counts as free while its name starts with '\0' or ' ';
      * netdev_boot_setup_add() fills the first free slot it finds.
      */
 469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 470
 471 /**
 472  *      netdev_boot_setup_add   - add new setup entry
 473  *      @name: name of the device
 474  *      @map: configured settings for the device
 475  *
 476  *      Adds new setup entry to the dev_boot_setup list.  The function
 477  *      returns 0 on error and 1 on success.  This is a generic routine to
 478  *      all netdevices.
 479  */
 480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 481 {
 482         struct netdev_boot_setup *s;
 483         int i;
 484
 485         s = dev_boot_setup;
 486         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 487                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 488                         memset(s[i].name, 0, sizeof(s[i].name));
 489                         strlcpy(s[i].name, name, IFNAMSIZ);
 490                         memcpy(&s[i].map, map, sizeof(s[i].map));
 491                         break;
 492                 }
 493         }
 494
 495         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 496 }
 497
 498 /**
 499  *      netdev_boot_setup_check - check boot time settings
 500  *      @dev: the netdevice
 501  *
 502  *      Check boot time settings for the device.
 503  *      The found settings are set for the device to be used
 504  *      later in the device probing.
 505  *      Returns 0 if no settings found, 1 if they are.
 506  */
 507 int netdev_boot_setup_check(struct net_device *dev)
 508 {
 509         struct netdev_boot_setup *s = dev_boot_setup;
 510         int i;
 511
 512         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 513                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 514                     !strcmp(dev->name, s[i].name)) {
 515                         dev->irq        = s[i].map.irq;
 516                         dev->base_addr  = s[i].map.base_addr;
 517                         dev->mem_start  = s[i].map.mem_start;
 518                         dev->mem_end    = s[i].map.mem_end;
 519                         return 1;
 520                 }
 521         }
 522         return 0;
 523 }
 524 EXPORT_SYMBOL(netdev_boot_setup_check);
 525
 526
 527 /**
 528  *      netdev_boot_base        - get address from boot time settings
 529  *      @prefix: prefix for network device
 530  *      @unit: id for network device
 531  *
 532  *      Check boot time settings for the base address of device.
 533  *      The found settings are set for the device to be used
 534  *      later in the device probing.
 535  *      Returns 0 if no settings found.
 536  */
 537 unsigned long netdev_boot_base(const char *prefix, int unit)
 538 {
 539         const struct netdev_boot_setup *s = dev_boot_setup;
 540         char name[IFNAMSIZ];
 541         int i;
 542
 543         sprintf(name, "%s%d", prefix, unit);
 544
 545         /*
 546          * If device already registered then return base of 1
 547          * to indicate not to probe for this interface
 548          */
 549         if (__dev_get_by_name(&init_net, name))
 550                 return 1;
 551
 552         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 553                 if (!strcmp(name, s[i].name))
 554                         return s[i].map.base_addr;
 555         return 0;
 556 }
 557
 558 /*
 559  * Saves at boot time configured settings for any netdevice.
 560  */
 561 int __init netdev_boot_setup(char *str)
 562 {
 563         int ints[5];
 564         struct ifmap map;
 565
 566         str = get_options(str, ARRAY_SIZE(ints), ints);
 567         if (!str || !*str)
 568                 return 0;
 569
 570         /* Save settings */
 571         memset(&map, 0, sizeof(map));
 572         if (ints[0] > 0)
 573                 map.irq = ints[1];
 574         if (ints[0] > 1)
 575                 map.base_addr = ints[2];
 576         if (ints[0] > 2)
 577                 map.mem_start = ints[3];
 578         if (ints[0] > 3)
 579                 map.mem_end = ints[4];
 580
 581         /* Add new entry to the list */
 582         return netdev_boot_setup_add(str, &map);
 583 }
 584
 585 __setup("netdev=", netdev_boot_setup);
 586
 587 /*******************************************************************************
 588
 589                             Device Interface Subroutines
 590
 591 *******************************************************************************/
 592
 593 /**
 594  *      __dev_get_by_name       - find a device by its name
 595  *      @net: the applicable net namespace
 596  *      @name: name to find
 597  *
 598  *      Find an interface by name. Must be called under RTNL semaphore
 599  *      or @dev_base_lock. If the name is found a pointer to the device
 600  *      is returned. If the name is not found then %NULL is returned. The
 601  *      reference counters are not incremented so the caller must be
 602  *      careful with locks.
 603  */
 604
 605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 606 {
 607         struct hlist_node *p;
 608         struct net_device *dev;
 609         struct hlist_head *head = dev_name_hash(net, name);
 610
 611         hlist_for_each_entry(dev, p, head, name_hlist)
 612                 if (!strncmp(dev->name, name, IFNAMSIZ))
 613                         return dev;
 614
 615         return NULL;
 616 }
 617 EXPORT_SYMBOL(__dev_get_by_name);
 618
 619 /**
 620  *      dev_get_by_name_rcu     - find a device by its name
 621  *      @net: the applicable net namespace
 622  *      @name: name to find
 623  *
 624  *      Find an interface by name.
 625  *      If the name is found a pointer to the device is returned.
 626  *      If the name is not found then %NULL is returned.
 627  *      The reference counters are not incremented so the caller must be
 628  *      careful with locks. The caller must hold RCU lock.
 629  */
 630
 631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 632 {
 633         struct hlist_node *p;
 634         struct net_device *dev;
 635         struct hlist_head *head = dev_name_hash(net, name);
 636
 637         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 638                 if (!strncmp(dev->name, name, IFNAMSIZ))
 639                         return dev;
 640
 641         return NULL;
 642 }
 643 EXPORT_SYMBOL(dev_get_by_name_rcu);
 644
 645 /**
 646  *      dev_get_by_name         - find a device by its name
 647  *      @net: the applicable net namespace
 648  *      @name: name to find
 649  *
 650  *      Find an interface by name. This can be called from any
 651  *      context and does its own locking. The returned handle has
 652  *      the usage count incremented and the caller must use dev_put() to
 653  *      release it when it is no longer needed. %NULL is returned if no
 654  *      matching device is found.
 655  */
 656
 657 struct net_device *dev_get_by_name(struct net *net, const char *name)
 658 {
 659         struct net_device *dev;
 660
 661         rcu_read_lock();
 662         dev = dev_get_by_name_rcu(net, name);
 663         if (dev)
 664                 dev_hold(dev);
 665         rcu_read_unlock();
 666         return dev;
 667 }
 668 EXPORT_SYMBOL(dev_get_by_name);
 669
 670 /**
 671  *      __dev_get_by_index - find a device by its ifindex
 672  *      @net: the applicable net namespace
 673  *      @ifindex: index of device
 674  *
 675  *      Search for an interface by index. Returns %NULL if the device
 676  *      is not found or a pointer to the device. The device has not
 677  *      had its reference counter increased so the caller must be careful
 678  *      about locking. The caller must hold either the RTNL semaphore
 679  *      or @dev_base_lock.
 680  */
 681
 682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 683 {
 684         struct hlist_node *p;
 685         struct net_device *dev;
 686         struct hlist_head *head = dev_index_hash(net, ifindex);
 687
 688         hlist_for_each_entry(dev, p, head, index_hlist)
 689                 if (dev->ifindex == ifindex)
 690                         return dev;
 691
 692         return NULL;
 693 }
 694 EXPORT_SYMBOL(__dev_get_by_index);
 695
 696 /**
 697  *      dev_get_by_index_rcu - find a device by its ifindex
 698  *      @net: the applicable net namespace
 699  *      @ifindex: index of device
 700  *
 701  *      Search for an interface by index. Returns %NULL if the device
 702  *      is not found or a pointer to the device. The device has not
 703  *      had its reference counter increased so the caller must be careful
 704  *      about locking. The caller must hold RCU lock.
 705  */
 706
 707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 708 {
 709         struct hlist_node *p;
 710         struct net_device *dev;
 711         struct hlist_head *head = dev_index_hash(net, ifindex);
 712
 713         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 714                 if (dev->ifindex == ifindex)
 715                         return dev;
 716
 717         return NULL;
 718 }
 719 EXPORT_SYMBOL(dev_get_by_index_rcu);
 720
 721
 722 /**
 723  *      dev_get_by_index - find a device by its ifindex
 724  *      @net: the applicable net namespace
 725  *      @ifindex: index of device
 726  *
 727  *      Search for an interface by index. Returns NULL if the device
 728  *      is not found or a pointer to the device. The device returned has
 729  *      had a reference added and the pointer is safe until the user calls
 730  *      dev_put to indicate they have finished with it.
 731  */
 732
 733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 734 {
 735         struct net_device *dev;
 736
 737         rcu_read_lock();
 738         dev = dev_get_by_index_rcu(net, ifindex);
 739         if (dev)
 740                 dev_hold(dev);
 741         rcu_read_unlock();
 742         return dev;
 743 }
 744 EXPORT_SYMBOL(dev_get_by_index);
 745
 746 /**
 747  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 748  *      @net: the applicable net namespace
 749  *      @type: media type of device
 750  *      @ha: hardware address
 751  *
 752  *      Search for an interface by MAC address. Returns NULL if the device
 753  *      is not found or a pointer to the device.
 754  *      The caller must hold RCU or RTNL.
 755  *      The returned device has not had its ref count increased
 756  *      and the caller must therefore be careful about locking
 757  *
 758  */
 759
 760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 761                                        const char *ha)
 762 {
 763         struct net_device *dev;
 764
 765         for_each_netdev_rcu(net, dev)
 766                 if (dev->type == type &&
 767                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 768                         return dev;
 769
 770         return NULL;
 771 }
 772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 773
 774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 775 {
 776         struct net_device *dev;
 777
 778         ASSERT_RTNL();
 779         for_each_netdev(net, dev)
 780                 if (dev->type == type)
 781                         return dev;
 782
 783         return NULL;
 784 }
 785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 786
 787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 788 {
 789         struct net_device *dev, *ret = NULL;
 790
 791         rcu_read_lock();
 792         for_each_netdev_rcu(net, dev)
 793                 if (dev->type == type) {
 794                         dev_hold(dev);
 795                         ret = dev;
 796                         break;
 797                 }
 798         rcu_read_unlock();
 799         return ret;
 800 }
 801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 802
 803 /**
 804  *      dev_get_by_flags_rcu - find any device with given flags
 805  *      @net: the applicable net namespace
 806  *      @if_flags: IFF_* values to look for
 807  *      @mask: which bits of @if_flags take part in the comparison
 808  *
 809  *      Walks the namespace's device list and returns the first device whose
 810  *      flags agree with @if_flags on every bit set in @mask, or NULL if
 811  *      there is none. Must be called inside rcu_read_lock(); the reference
 812  *      count of the returned device is left unchanged.
 813  */
 814 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 815                                     unsigned short mask)
 816 {
 817         struct net_device *cand;
 818
 819         for_each_netdev_rcu(net, cand) {
 820                 unsigned int differing = (cand->flags ^ if_flags) & mask;
 821
 822                 /* no masked bit differs: this is a match */
 823                 if (!differing)
 824                         return cand;
 825         }
 826         return NULL;
 827 }
 828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 829
 830 /**
 831  *      dev_valid_name - check if name is okay for network device
 832  *      @name: name string
 833  *
 834  *      Network device names need to be valid file names to allow sysfs
 835  *      to work, so "", "." and ".." and anything containing '/' are
 836  *      refused, as is any kind of whitespace. Names that do not fit in
 837  *      IFNAMSIZ bytes including the terminating NUL are refused too.
 838  *      Returns 1 if @name is acceptable, 0 otherwise.
 839  */
 840 int dev_valid_name(const char *name)
 841 {
 842         if (*name == '\0')
 843                 return 0;
 844         /* bounded scan: never read past IFNAMSIZ bytes of the buffer */
 845         if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
 846                 return 0;
 847         if (!strcmp(name, ".") || !strcmp(name, ".."))
 848                 return 0;
 849         for (; *name; name++)
 850                 if (*name == '/' || isspace(*name))
 851                         return 0;
 852         return 1;
 853 }
 854 EXPORT_SYMBOL(dev_valid_name);
 855
 856 /**
 857  *      __dev_alloc_name - allocate a name for a device
 858  *      @net: network namespace to allocate the device name in
 859  *      @name: name format string
 860  *      @buf:  scratch buffer and result name string
 861  *
 862  *      Passed a format string - eg "lt%d" - it will try and find a suitable
 863  *      id. It scans the list of devices to build up a map of the units in
 864  *      use, then chooses the first free slot. The caller must hold the
 865  *      dev_base or rtnl lock while allocating the name and adding the
 866  *      device in order to avoid duplicates.
 867  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 868  *      Returns the number of the unit assigned, or -EINVAL for a bad format,
 869  *      -ENOMEM if the map cannot be allocated, -ENFILE if the name is taken.
 870  */
 871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 872 {
 873         int i = 0;      /* unit number; stays 0 when @name has no '%' */
 874         const char *p;
 875         const int max_netdevices = 8*PAGE_SIZE;
 876         unsigned long *inuse;
 877         struct net_device *d;
 878
 879         p = strnchr(name, IFNAMSIZ-1, '%');
 880         if (p) {
 881                 /*
 882                  * Verify the string as this thing may have come from
 883                  * the user.  There must be exactly one "%d" and no other
 884                  * "%" characters.
 885                  */
 886                 if (p[1] != 'd' || strchr(p + 2, '%'))
 887                         return -EINVAL;
 888
 889                 /* Use one page as a bit array of possible slots */
 890                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 891                 if (!inuse)
 892                         return -ENOMEM;
 893
 894                 for_each_netdev(net, d) {
 895                         if (!sscanf(d->name, name, &i))
 896                                 continue;
 897                         if (i < 0 || i >= max_netdevices)
 898                                 continue;       /* unit outside the bitmap */
 899
 900                         /* avoid cases where sscanf is not exact inverse of printf */
 901                         snprintf(buf, IFNAMSIZ, name, i);
 902                         if (!strncmp(buf, d->name, IFNAMSIZ))
 903                                 set_bit(i, inuse);
 904                 }
 905
 906                 i = find_first_zero_bit(inuse, max_netdevices);
 907                 free_page((unsigned long) inuse);
 908         }
 909
 910         if (buf != name)
 911                 snprintf(buf, IFNAMSIZ, name, i);
 912         if (!__dev_get_by_name(net, buf))
 913                 return i;
 914
 915         /* It is possible to run out of possible slots
 916          * when the name is long and there isn't enough space left
 917          * for the digits, or if all bits are used.
 918          */
 919         return -ENFILE;
 920 }
 921
 922 /**
 923  *      dev_alloc_name - allocate a name for a device
 924  *      @dev: device
 925  *      @name: name format string
 926  *
 927  *      Passed a format string - eg "lt%d" - it picks the lowest unit number
 928  *      for which the resulting name is unused in the namespace of @dev and,
 929  *      on success, stores that name in @dev->name. The caller must hold the
 930  *      dev_base or rtnl lock while allocating the name and adding the
 931  *      device in order to avoid duplicates.
 932  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 933  *      Returns the number of the unit assigned or a negative errno code.
 934  */
 935
 936 int dev_alloc_name(struct net_device *dev, const char *name)
 937 {
 938         char candidate[IFNAMSIZ];
 939         int unit;
 940
 941         BUG_ON(!dev_net(dev));
 942         unit = __dev_alloc_name(dev_net(dev), name, candidate);
 943         if (unit < 0)
 944                 return unit;    /* @dev->name is left untouched on failure */
 945
 946         strlcpy(dev->name, candidate, IFNAMSIZ);
 947         return unit;
 948 }
 949 EXPORT_SYMBOL(dev_alloc_name);
 950
 951 /* Check @name and store it in @dev->name; @fmt allows "%d" templates. */
 952 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 953 {
 954         BUG_ON(!dev_net(dev));
 955
 956         if (!dev_valid_name(name))
 957                 return -EINVAL;
 958
 959         /* "eth%d" style template: let dev_alloc_name() pick the unit */
 960         if (fmt && strchr(name, '%'))
 961                 return dev_alloc_name(dev, name);
 962
 963         if (__dev_get_by_name(dev_net(dev), name))
 964                 return -EEXIST;
 965
 966         if (dev->name != name)  /* skip the copy when @name is dev->name */
 967                 strlcpy(dev->name, name, IFNAMSIZ);
 968         return 0;
 969 }
 970
 971 /**
 972  *      dev_change_name - change name of a device
 973  *      @dev: device
 974  *      @newname: name (or format string) must be at least IFNAMSIZ
 975  *
 976  *      Change name of a device, can pass format strings "eth%d" for
 977  *      wildcarding. Must be called under RTNL. Returns -EBUSY if the device
 978  *      is up, a negative errno if the name is rejected or a notifier vetoes
 979  *      the change (the old name is restored), and a value >= 0 otherwise.
 980  */
 981 int dev_change_name(struct net_device *dev, const char *newname)
 982 {
 983         char oldname[IFNAMSIZ];
 984         int err = 0;
 985         int ret;
 986
 987         ASSERT_RTNL();
 988         BUG_ON(!dev_net(dev));
 989
 990         if (dev->flags & IFF_UP)
 991                 return -EBUSY;
 992
 993         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 994                 return 0;
 995
 996         memcpy(oldname, dev->name, IFNAMSIZ);
 997
 998         err = dev_get_valid_name(dev, newname, 1);
 999         if (err < 0)
1000                 return err;
1001
1002 rollback:
1003         ret = device_rename(&dev->dev, dev->name);
1004         if (ret) {
1005                 memcpy(dev->name, oldname, IFNAMSIZ);
1006                 return ret;
1007         }
1008
1009         /* RCU readers may still be walking the chain: unlink RCU-safely */
1010         write_lock_bh(&dev_base_lock);
1011         hlist_del_rcu(&dev->name_hlist);
1012         write_unlock_bh(&dev_base_lock);
1013
1014         synchronize_rcu();
1015
1016         write_lock_bh(&dev_base_lock);
1017         hlist_add_head_rcu(&dev->name_hlist,
1018                            dev_name_hash(dev_net(dev), dev->name));
1019         write_unlock_bh(&dev_base_lock);
1020
1021         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1022         ret = notifier_to_errno(ret);
1023         if (ret) {
1024                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1025                 if (err >= 0) {
1026                         err = ret;
1027                         memcpy(dev->name, oldname, IFNAMSIZ);
1028                         goto rollback;
1029                 } else {
1030                         printk(KERN_ERR
1031                                "%s: name change rollback failed: %d.\n",
1032                                dev->name, ret);
1033                 }
1034         }
1035         return err;
1036 }
1037
1038 /**
1039  *      dev_set_alias - change ifalias of a device
1040  *      @dev: device
1041  *      @alias: name up to IFALIASZ
1042  *      @len: limit of bytes to copy from info
1043  *
1044  *      Set ifalias for a device; a zero @len removes it. Returns @len,
1045  *      -EINVAL if @len >= IFALIASZ, or -ENOMEM (the old alias is then kept).
1046  */
1047 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1048 {
1049         char *new_ifalias;
1050
1051         ASSERT_RTNL();
1052
1053         if (len >= IFALIASZ)
1054                 return -EINVAL;
1055
1056         if (!len) {
1057                 kfree(dev->ifalias);    /* kfree(NULL) is a no-op */
1058                 dev->ifalias = NULL;
1059                 return 0;
1060         }
1061
1062         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1063         if (!new_ifalias)
1064                 return -ENOMEM; /* dev->ifalias still owns the old buffer */
1065         dev->ifalias = new_ifalias;
1066         strlcpy(dev->ifalias, alias, len+1);
1067         return len;
1068 }
1069
1070 /**
1071  *      netdev_features_change - device changes features
1072  *      @dev: device to cause notification
1073  *
1074  *      Called to indicate a device has changed features (NETDEV_FEAT_CHANGE).
1075  */
1076 void netdev_features_change(struct net_device *dev)
1077 {
1078         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079 }
1080 EXPORT_SYMBOL(netdev_features_change);
1081
1082 /**
1083  *      netdev_state_change - device changes state
1084  *      @dev: device to cause notification
1085  *
1086  *      Called to indicate a device has changed state. For a device that is
1087  *      up, this runs the netdev notifier chain with %NETDEV_CHANGE and sends
1088  *      an RTM_NEWLINK message; for a device that is down it does nothing.
1089  */
1090 void netdev_state_change(struct net_device *dev)
1091 {
1092         if (!(dev->flags & IFF_UP))
1093                 return;
1094         call_netdevice_notifiers(NETDEV_CHANGE, dev);
1095         rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1096 }
1097 EXPORT_SYMBOL(netdev_state_change);
1098
1099 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1100 {
1101         return call_netdevice_notifiers(event, dev); /* raw chain result */
1102 }
1103 EXPORT_SYMBOL(netdev_bonding_change);
1104
1105 /**
1106  *      dev_load        - load a network module
1107  *      @net: the applicable net namespace
1108  *      @name: name of interface
1109  *
1110  *      If no interface called @name exists, try to load a module for it:
1111  *      "netdev-<name>" with CAP_NET_ADMIN, else "<name>" with CAP_SYS_MODULE
1112  *      (deprecated). Without module loading support this becomes a nop.
1113  */
1114
1115 void dev_load(struct net *net, const char *name)
1116 {
1117         struct net_device *dev;
1118         int no_module;
1119
1120         rcu_read_lock();
1121         dev = dev_get_by_name_rcu(net, name);
1122         rcu_read_unlock();
1123
1124         no_module = !dev;       /* nonzero: no such interface yet */
1125         if (no_module && capable(CAP_NET_ADMIN))
1126                 no_module = request_module("netdev-%s", name);
1127         if (no_module && capable(CAP_SYS_MODULE)) {
1128                 if (!request_module("%s", name))
1129                         pr_err("Loading kernel module for a network device "
1130 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1131 "instead\n", name);
1132         }
1133 }
1134 EXPORT_SYMBOL(dev_load);
1135 /* Core of dev_open(): bring @dev up under RTNL, without the NETDEV_UP event. */
1136 static int __dev_open(struct net_device *dev)
1137 {
1138         const struct net_device_ops *ops = dev->netdev_ops;
1139         int ret;
1140
1141         ASSERT_RTNL();
1142
1143         if (!netif_device_present(dev))
1144                 return -ENODEV;
1145
1146         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1147         ret = notifier_to_errno(ret);
1148         if (ret)
1149                 return ret;     /* a NETDEV_PRE_UP notifier vetoed the open */
1150
1151         set_bit(__LINK_STATE_START, &dev->state);
1152
1153         if (ops->ndo_validate_addr)
1154                 ret = ops->ndo_validate_addr(dev);
1155
1156         if (!ret && ops->ndo_open)
1157                 ret = ops->ndo_open(dev);
1158
1159         if (ret)
1160                 clear_bit(__LINK_STATE_START, &dev->state); /* undo */
1161         else {
1162                 dev->flags |= IFF_UP;
1163                 net_dmaengine_get();
1164                 dev_set_rx_mode(dev);
1165                 dev_activate(dev);
1166         }
1167
1168         return ret;
1169 }
1170
1171 /**
1172  *      dev_open        - prepare an interface for use.
1173  *      @dev:   device to open
1174  *
1175  *      Takes a device from down to up state via __dev_open(). Once that
1176  *      has succeeded an RTM_NEWLINK message covering IFF_UP and
1177  *      IFF_RUNNING is emitted and a %NETDEV_UP message is sent to the
1178  *      netdev notifier chain.
1179  *
1180  *      Calling this function on an active interface is a nop. On a failure
1181  *      a negative errno code is returned.
1182  */
1183 int dev_open(struct net_device *dev)
1184 {
1185         int err = 0;
1186
1187         /* an interface that is already up is left alone */
1188         if (!(dev->flags & IFF_UP)) {
1189                 err = __dev_open(dev);
1190                 if (err >= 0) {
1191                         /* tell rtnetlink listeners, then the notifier chain */
1192                         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1193                         call_netdevice_notifiers(NETDEV_UP, dev);
1194                 }
1195         }
1196
1197         return err;
1198 }
1199 EXPORT_SYMBOL(dev_open);
1200 /* Take down every device linked on @head via unreg_list; needs RTNL. */
1201 static int __dev_close_many(struct list_head *head)
1202 {
1203         struct net_device *dev;
1204
1205         ASSERT_RTNL();
1206         might_sleep();
1207
1208         list_for_each_entry(dev, head, unreg_list) {
1209                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1210
1211                 clear_bit(__LINK_STATE_START, &dev->state);
1212
1213                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1214                  * can be even on different cpu. So just clear netif_running().
1215                  *
1216                  * dev->stop() will invoke napi_disable() on all of its
1217                  * napi_struct instances on this device.
1218                  */
1219                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1220         }
1221
1222         dev_deactivate_many(head);
1223
1224         list_for_each_entry(dev, head, unreg_list) {
1225                 const struct net_device_ops *ops = dev->netdev_ops;
1226
1227                 /*
1228                  *      Call the device specific close. This cannot fail.
1229                  *      Only if device is UP
1230                  *
1231                  *      We allow it to be called even after a DETACH hot-plug
1232                  *      event.
1233                  */
1234                 if (ops->ndo_stop)
1235                         ops->ndo_stop(dev);
1236
1237                 dev->flags &= ~IFF_UP;
1238                 net_dmaengine_put();
1239         }
1240
1241         return 0;       /* no failure path in this function */
1242 }
1243
1244 static int __dev_close(struct net_device *dev)
1245 {
1246         LIST_HEAD(batch);       /* one-entry list for __dev_close_many() */
1247         int err;
1248
1249         list_add(&dev->unreg_list, &batch);
1250         err = __dev_close_many(&batch);
1251         list_del(&batch);       /* detach the on-stack list head again */
1252         return err;
1253 }
1254
1255 static int dev_close_many(struct list_head *head)
1256 {
1257         struct net_device *dev, *next;
1258         LIST_HEAD(already_down);        /* devices found not to be up */
1259
1260         list_for_each_entry_safe(dev, next, head, unreg_list)
1261                 if (!(dev->flags & IFF_UP))
1262                         list_move(&dev->unreg_list, &already_down);
1263
1264         __dev_close_many(head);         /* @head now holds only up devices */
1265
1266         list_for_each_entry(dev, head, unreg_list) {
1267                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1269         }
1270
1271         /* rollback_registered_many needs the complete original list */
1272         list_splice(&already_down, head);
1273         return 0;
1274 }
1275
1276 /**
1277  *      dev_close - shutdown an interface.
1278  *      @dev: device to shutdown
1279  *
1280  *      Runs dev_close_many() on a list holding just @dev: an active device
1281  *      gets %NETDEV_GOING_DOWN on the netdev notifier chain, is deactivated
1282  *      and stopped, and finally gets %NETDEV_DOWN. A device that is not up
1283  *      is left as it is. Always returns 0.
1284  */
1285 int dev_close(struct net_device *dev)
1286 {
1287         LIST_HEAD(batch);
1288
1289         list_add(&dev->unreg_list, &batch);
1290         dev_close_many(&batch);
1291         list_del(&batch);       /* detach the on-stack list head again */
1292         return 0;
1293 }
1294 EXPORT_SYMBOL(dev_close);
1295
1296
1297 /**
1298  *      dev_disable_lro - disable Large Receive Offload on a device
1299  *      @dev: device
1300  *
1301  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1302  *      called under RTNL.  This is needed if received packets may be
1303  *      forwarded to another interface.  Does nothing if the ethtool flags
1304  *      show LRO as already off; warns if LRO is still a feature afterwards.
1305  */
1306 void dev_disable_lro(struct net_device *dev)
1307 {
1308         const struct ethtool_ops *eops = dev->ethtool_ops;
1309         u32 cur;
1310
1311         /* prefer the driver's own getter, else the generic one */
1312         cur = (eops && eops->get_flags) ? eops->get_flags(dev) :
1313                                           ethtool_op_get_flags(dev);
1314
1315         if (cur & ETH_FLAG_LRO) {
1316                 __ethtool_set_flags(dev, cur & ~ETH_FLAG_LRO);
1317                 WARN_ON(dev->features & NETIF_F_LRO);
1318         }
1319 }
1320 EXPORT_SYMBOL(dev_disable_lro);
1321
1322
1323 static int dev_boot_phase = 1;  /* while set, new notifiers get no replay */
1324
1325 /**
1326  *      register_netdevice_notifier - register a network notifier block
1327  *      @nb: notifier
1328  *
1329  *      Register a notifier to be called when network device events occur.
1330  *      The notifier passed is linked into the kernel structures and must
1331  *      not be reused until it has been unregistered. A negative errno code
1332  *      is returned on a failure.
1333  *
1334  *      When registered all registration and up events are replayed
1335  *      to the new notifier to allow it to have a race free view of the
1336  *      network device list. If it rejects a replayed registration, the
1337  *      events replayed so far are undone and it is unregistered again.
1338  */
1339 int register_netdevice_notifier(struct notifier_block *nb)
1340 {
1341         struct net_device *dev;
1342         struct net_device *last;
1343         struct net *net;
1344         int err;
1345
1346         rtnl_lock();
1347         err = raw_notifier_chain_register(&netdev_chain, nb);
1348         if (err)
1349                 goto unlock;
1350         if (dev_boot_phase)
1351                 goto unlock;
1352         for_each_net(net) {
1353                 for_each_netdev(net, dev) {
1354                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1355                         err = notifier_to_errno(err);
1356                         if (err)
1357                                 goto rollback;
1358
1359                         if (!(dev->flags & IFF_UP))
1360                                 continue;
1361
1362                         nb->notifier_call(nb, NETDEV_UP, dev);
1363                 }
1364         }
1365
1366 unlock:
1367         rtnl_unlock();
1368         return err;
1369
1370 rollback:
1371         last = dev;
1372         for_each_net(net) {
1373                 for_each_netdev(net, dev) {
1374                         if (dev == last)
1375                                 goto outroll;   /* done: leave both loops */
1376
1377                         if (dev->flags & IFF_UP) {
1378                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1379                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1380                         }
1381                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1382                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1383                 }
1384         }
1385 outroll:
1386         raw_notifier_chain_unregister(&netdev_chain, nb);
1387         goto unlock;
1388 }
1389 EXPORT_SYMBOL(register_netdevice_notifier);
1390
1391 /**
1392  *      unregister_netdevice_notifier - unregister a network notifier block
1393  *      @nb: notifier
1394  *
1395  *      Unregister a notifier previously registered by
1396  *      register_netdevice_notifier(). The notifier is unlinked from the
1397  *      kernel structures, under the rtnl lock, and may then be reused.
1398  *      A negative errno code is returned on a failure.
1399  */
1400
1401 int unregister_netdevice_notifier(struct notifier_block *nb)
1402 {
1403         int rc;
1404
1405         rtnl_lock();
1406         rc = raw_notifier_chain_unregister(&netdev_chain, nb);
1407         rtnl_unlock();
1408         return rc;
1409 }
1410 EXPORT_SYMBOL(unregister_netdevice_notifier);
1411
1412 /**
1413  *      call_netdevice_notifiers - call all network notifier blocks
1414  *      @val: value passed unmodified to notifier function
1415  *      @dev: net_device pointer passed unmodified to notifier function
1416  *
1417  *      Call all network notifier blocks.  Parameters and return value
1418  *      are as for raw_notifier_call_chain().  Asserts that RTNL is held.
1419  */
1420
1421 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1422 {
1423         ASSERT_RTNL();
1424         return raw_notifier_call_chain(&netdev_chain, val, dev);
1425 }
1426
1427 /* Number of active consumers of rx skb time stamps; stamping is on when > 0 */
1428 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1429
1430 void net_enable_timestamp(void)
1431 {
1432         atomic_inc(&netstamp_needed);   /* one more time stamp consumer */
1433 }
1434 EXPORT_SYMBOL(net_enable_timestamp);
1435
1436 void net_disable_timestamp(void)
1437 {
1438         atomic_dec(&netstamp_needed);   /* one time stamp consumer less */
1439 }
1440 EXPORT_SYMBOL(net_disable_timestamp);
1441
1442 static inline void net_timestamp_set(struct sk_buff *skb)
1443 {
1444         if (!atomic_read(&netstamp_needed))
1445                 skb->tstamp.tv64 = 0;   /* nobody wants stamps: clear it */
1446         else
1447                 __net_timestamp(skb);
1448 }
1449
1450 static inline void net_timestamp_check(struct sk_buff *skb)
1451 {
1452         if (skb->tstamp.tv64 == 0 && atomic_read(&netstamp_needed) != 0)
1453                 __net_timestamp(skb);   /* stamp only unstamped skbs */
1454 }
1455
1456 /**
1457  * dev_forward_skb - loopback an skb to another netif
1458  * @dev: destination network device
1459  * @skb: buffer to forward
1460  *
1461  * return values:
1462  *      NET_RX_SUCCESS  (no congestion)
1463  *      NET_RX_DROP     (packet was dropped, but freed)
1464  *
1465  * dev_forward_skb can be used for injecting an skb from the start_xmit
1466  * function of one device into the receive queue of another device. The
1467  * skb is dropped if @dev is down or longer than mtu + header + VLAN_HLEN.
1468  *
1469  * The receiving device may be in another namespace, so we have to clear
1470  * all information in the skb that could impact namespace isolation.
1471  */
1472 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1473 {
1474         unsigned int max_len;
1475
1476         skb_orphan(skb);
1477         nf_reset(skb);
1478
1479         max_len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1480         if (unlikely(!(dev->flags & IFF_UP) || skb->len > max_len)) {
1481                 atomic_long_inc(&dev->rx_dropped);
1482                 kfree_skb(skb);
1483                 return NET_RX_DROP;
1484         }
1485         skb_set_dev(skb, dev);
1486         skb->tstamp.tv64 = 0;
1487         skb->pkt_type = PACKET_HOST;
1488         skb->protocol = eth_type_trans(skb, dev);
1489         return netif_rx(skb);
1490 }
1491 EXPORT_SYMBOL_GPL(dev_forward_skb);
1492
1493 static inline int deliver_skb(struct sk_buff *skb,
1494                               struct packet_type *pt_prev,
1495                               struct net_device *orig_dev)
1496 {
1497         atomic_inc(&skb->users);        /* take a reference before the call */
1498         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1499 }
1500
1501 /*
1502  *      Support routine. Sends outgoing frames to any network taps
1503  *      currently in use: the frame is cloned once, and the clone is shared
1504  *      by every tap on ptype_all that is bound to @dev or to no device.
1505  */
1506 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1507 {
1508         struct packet_type *ptype;
1509         struct sk_buff *skb2 = NULL;
1510         struct packet_type *pt_prev = NULL;
1511
1512         rcu_read_lock();
1513         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1514                 /* Never send packets back to the socket
1515                  * they originated from - MvS (miquels@drinkel.ow.org)
1516                  */
1517                 if ((ptype->dev == dev || !ptype->dev) &&
1518                     (ptype->af_packet_priv == NULL ||
1519                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1520                         if (pt_prev) {
1521                                 deliver_skb(skb2, pt_prev, skb->dev);
1522                                 pt_prev = ptype;
1523                                 continue;
1524                         }
1525                         /* first match: make the one clone every tap shares */
1526                         skb2 = skb_clone(skb, GFP_ATOMIC);
1527                         if (!skb2)
1528                                 break;  /* no clone: nothing is delivered */
1529
1530                         net_timestamp_set(skb2);
1531
1532                         /* skb->nh should be correctly
1533                            set by sender, so that the second statement is
1534                            just protection against buggy protocols.
1535                          */
1536                         skb_reset_mac_header(skb2);
1537
1538                         if (skb_network_header(skb2) < skb2->data ||
1539                             skb2->network_header > skb2->tail) {
1540                                 if (net_ratelimit())
1541                                         printk(KERN_CRIT "protocol %04x is "
1542                                                "buggy, dev %s\n",
1543                                                ntohs(skb2->protocol),
1544                                                dev->name);
1545                                 skb_reset_network_header(skb2);
1546                         }
1547
1548                         skb2->transport_header = skb2->network_header;
1549                         skb2->pkt_type = PACKET_OUTGOING;
1550                         pt_prev = ptype;
1551                 }
1552         }
1553         if (pt_prev)    /* last tap: clone passed without extra reference */
1554                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1555         rcu_read_unlock();
1556 }
1557
1558 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1559  * @dev: Network device
1560  * @txq: number of queues available
1561  *
1562  * If real_num_tx_queues is changed the tc mappings may no longer be
1563  * valid. A traffic class whose offset/count pair no longer fits within
1564  * @txq has the priorities that map to it pointed at TC0 instead; with
1565  * no priority mapping to it, the pair is no longer used. If TC0 itself
1566  * does not fit nothing can be done, so priority mappings are disabled.
1567  * It is expected that drivers will fix this mapping if they can before
1568  * calling netif_set_real_num_tx_queues.
1569  */
1570 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1571 {
1572         struct netdev_tc_txq *map = &dev->tc_to_txq[0];
1573         int prio;
1574
1575         /* TC0 itself no longer fits: give up on TC mapping altogether */
1576         if (map->offset + map->count > txq) {
1577                 pr_warning("Number of in use tx queues changed "
1578                            "invalidating tc mappings. Priority "
1579                            "traffic classification disabled!\n");
1580                 dev->num_tc = 0;
1581                 return;
1582         }
1583
1584         /* Any priority whose class no longer fits falls back to TC0 */
1585         for (prio = 1; prio <= TC_BITMASK; prio++) {
1586                 int cls = netdev_get_prio_tc_map(dev, prio);
1587
1588                 map = &dev->tc_to_txq[cls];
1589                 if (map->offset + map->count <= txq)
1590                         continue;
1591                 pr_warning("Number of in use tx queues "
1592                            "changed. Priority %i to tc "
1593                            "mapping %i is no longer valid "
1594                            "setting map to 0\n",
1595                            prio, cls);
1596                 netdev_set_prio_tc_map(dev, prio, 0);
1597         }
1598 }
1599
1600 /*
1601  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1602  * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
1603  * Returns 0, -EINVAL for a bad @txq, or the kobject update error.
1604  */
1605 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1606 {
1607         int err;
1608
1609         if (txq == 0 || txq > dev->num_tx_queues)
1610                 return -EINVAL;
1611
1612         if (dev->reg_state == NETREG_REGISTERED ||
1613             dev->reg_state == NETREG_UNREGISTERING) {
1614                 ASSERT_RTNL();
1615                 err = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1616                                                    txq);
1617                 if (err)
1618                         return err;
1619
1620                 if (dev->num_tc)
1621                         netif_setup_tc(dev, txq);
1622
1623                 if (dev->real_num_tx_queues > txq)
1624                         qdisc_reset_all_tx_gt(dev, txq);
1625         }
1626
1627         dev->real_num_tx_queues = txq;
1628         return 0;
1629 }
1630 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1631
1632 #ifdef CONFIG_RPS
1633 /**
1634  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1635  *      @dev: Network device
1636  *      @rxq: Actual number of RX queues
1637  *
1638  *      This must be called either with the rtnl_lock held or before
1639  *      registration of the net device.  Returns 0 on success, -EINVAL if
1640  *      @rxq is 0 or exceeds num_rx_queues, or the error from updating the
1641  *      rx queue kobjects, which is only attempted on a registered device.
1642  */
1643 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1644 {
1645         int err;
1646
1647         if (rxq == 0 || rxq > dev->num_rx_queues)
1648                 return -EINVAL;
1649
1650         if (dev->reg_state == NETREG_REGISTERED) {
1651                 ASSERT_RTNL();
1652
1653                 err = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1654                                                    rxq);
1655                 if (err)
1656                         return err;
1657         }
1658
1659         dev->real_num_rx_queues = rxq;
1660         return 0;
1661 }
1662 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1663 #endif
1664
1665 static inline void __netif_reschedule(struct Qdisc *q)
1666 {
1667         struct softnet_data *sd;
1668         unsigned long flags;
1669
1670         local_irq_save(flags);
1671         sd = &__get_cpu_var(softnet_data);
1672         q->next_sched = NULL;                   /* @q will be the last entry */
1673         *sd->output_queue_tailp = q;            /* link @q at the old tail */
1674         sd->output_queue_tailp = &q->next_sched; /* tail now ends at @q */
1675         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1676         local_irq_restore(flags);
1677 }
1678
1679 void __netif_schedule(struct Qdisc *q)
1680 {
1681         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1682                 __netif_reschedule(q);  /* the bit was clear: queue @q */
1683 }
1684 EXPORT_SYMBOL(__netif_schedule);
1685
1686 void dev_kfree_skb_irq(struct sk_buff *skb)
1687 {
1688         struct softnet_data *sd;
1689         unsigned long flags;
1690
1691         if (!atomic_dec_and_test(&skb->users))
1692                 return;         /* other users remain */
1693         local_irq_save(flags);
1694         sd = &__get_cpu_var(softnet_data);
1695         skb->next = sd->completion_queue;       /* push on this CPU's list */
1696         sd->completion_queue = skb;
1697         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1698         local_irq_restore(flags);
1699 }
1700 EXPORT_SYMBOL(dev_kfree_skb_irq);
1701
/*
 * Free @skb from any context: dev_kfree_skb_irq() is used when in_irq()
 * or irqs_disabled() is true, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1710
1711
1712 /**
1713  * netif_device_detach - mark device as removed
1714  * @dev: network device
1715  *
1716  * Mark device as removed from system and therefore no longer available.
1717  */
1718 void netif_device_detach(struct net_device *dev)
1719 {
1720         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1721             netif_running(dev)) {
1722                 netif_tx_stop_all_queues(dev);
1723         }
1724 }
1725 EXPORT_SYMBOL(netif_device_detach);
1726
1727 /**
1728  * netif_device_attach - mark device as attached
1729  * @dev: network device
1730  *
1731  * Mark device as attached from system and restart if needed.
1732  */
1733 void netif_device_attach(struct net_device *dev)
1734 {
1735         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1736             netif_running(dev)) {
1737                 netif_tx_wake_all_queues(dev);
1738                 __netdev_watchdog_up(dev);
1739         }
1740 }
1741 EXPORT_SYMBOL(netif_device_attach);
1742
/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The buffer's dst reference is always dropped.  If the buffer is already
 * owned by a device that lives in a different network namespace than
 * @dev, all data private to that namespace is reset before the new device
 * is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *prev;

	skb_dst_drop(skb);

	prev = skb->dev;
	if (prev && !net_eq(dev_net(prev), dev_net(dev))) {
		/* Crossing namespaces: wipe per-namespace state. */
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}

	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1772
1773 /*
1774  * Invalidate hardware checksum when packet is to be mangled, and
1775  * complete checksum manually on outgoing path.
1776  */
1777 int skb_checksum_help(struct sk_buff *skb)
1778 {
1779         __wsum csum;
1780         int ret = 0, offset;
1781
1782         if (skb->ip_summed == CHECKSUM_COMPLETE)
1783                 goto out_set_summed;
1784
1785         if (unlikely(skb_shinfo(skb)->gso_size)) {
1786                 /* Let GSO fix up the checksum. */
1787                 goto out_set_summed;
1788         }
1789
1790         offset = skb_checksum_start_offset(skb);
1791         BUG_ON(offset >= skb_headlen(skb));
1792         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1793
1794         offset += skb->csum_offset;
1795         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1796
1797         if (skb_cloned(skb) &&
1798             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1799                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1800                 if (ret)
1801                         goto out;
1802         }
1803
1804         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1805 out_set_summed:
1806         skb->ip_summed = CHECKSUM_NONE;
1807 out:
1808         return ret;
1809 }
1810 EXPORT_SYMBOL(skb_checksum_help);
1811
1812 /**
1813  *      skb_gso_segment - Perform segmentation on skb.
1814  *      @skb: buffer to segment
1815  *      @features: features for the output path (see dev->features)
1816  *
1817  *      This function segments the given skb and returns a list of segments.
1818  *
1819  *      It may return NULL if the skb requires no segmentation.  This is
1820  *      only possible when GSO is used for verifying header integrity.
1821  */
1822 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1823 {
1824         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1825         struct packet_type *ptype;
1826         __be16 type = skb->protocol;
1827         int vlan_depth = ETH_HLEN;
1828         int err;
1829
1830         while (type == htons(ETH_P_8021Q)) {
1831                 struct vlan_hdr *vh;
1832
1833                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1834                         return ERR_PTR(-EINVAL);
1835
1836                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1837                 type = vh->h_vlan_encapsulated_proto;
1838                 vlan_depth += VLAN_HLEN;
1839         }
1840
1841         skb_reset_mac_header(skb);
1842         skb->mac_len = skb->network_header - skb->mac_header;
1843         __skb_pull(skb, skb->mac_len);
1844
1845         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1846                 struct net_device *dev = skb->dev;
1847                 struct ethtool_drvinfo info = {};
1848
1849                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1850                         dev->ethtool_ops->get_drvinfo(dev, &info);
1851
1852                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1853                      info.driver, dev ? dev->features : 0L,
1854                      skb->sk ? skb->sk->sk_route_caps : 0L,
1855                      skb->len, skb->data_len, skb->ip_summed);
1856
1857                 if (skb_header_cloned(skb) &&
1858                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1859                         return ERR_PTR(err);
1860         }
1861
1862         rcu_read_lock();
1863         list_for_each_entry_rcu(ptype,
1864                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1865                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1866                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1867                                 err = ptype->gso_send_check(skb);
1868                                 segs = ERR_PTR(err);
1869                                 if (err || skb_gso_ok(skb, features))
1870                                         break;
1871                                 __skb_push(skb, (skb->data -
1872                                                  skb_network_header(skb)));
1873                         }
1874                         segs = ptype->gso_segment(skb, features);
1875                         break;
1876                 }
1877         }
1878         rcu_read_unlock();
1879
1880         __skb_push(skb, skb->data - skb_mac_header(skb));
1881
1882         return segs;
1883 }
1884 EXPORT_SYMBOL(skb_gso_segment);
1885
/*
 * Report a hardware receive checksum failure: log a rate-limited error
 * naming the device (or "<unknown>" when @dev is NULL) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name;

	if (!net_ratelimit())
		return;

	name = dev ? dev->name : "<unknown>";
	printk(KERN_ERR "%s: hw csum failure.\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1898
/*
 * This check should be eliminated as soon as we know that:
 * 1. an IOMMU is present and allows all of memory to be mapped;
 * 2. no high memory really exists on this machine.
 *
 * Returns 1 if @skb has a page fragment that @dev must not be handed:
 * a highmem page while the device lacks NETIF_F_HIGHDMA, or (when
 * PCI_DMA_BUS_IS_PHYS) a page beyond the parent device's DMA mask.
 * Returns 0 otherwise, and always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	struct device *parent;
	int frag;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (frag = 0; frag < shinfo->nr_frags; frag++) {
			if (PageHighMem(shinfo->frags[frag].page))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	for (frag = 0; frag < shinfo->nr_frags; frag++) {
		dma_addr_t addr = page_to_phys(shinfo->frags[frag].page);

		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
1928
1929 struct dev_gso_cb {
1930         void (*destructor)(struct sk_buff *skb);
1931 };
1932
1933 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1934
1935 static void dev_gso_skb_destructor(struct sk_buff *skb)
1936 {
1937         struct dev_gso_cb *cb;
1938
1939         do {
1940                 struct sk_buff *nskb = skb->next;
1941
1942                 skb->next = nskb->next;
1943                 nskb->next = NULL;
1944                 kfree_skb(nskb);
1945         } while (skb->next);
1946
1947         cb = DEV_GSO_CB(skb);
1948         if (cb->destructor)
1949                 cb->destructor(skb);
1950 }
1951
1952 /**
1953  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1954  *      @skb: buffer to segment
1955  *      @features: device features as applicable to this skb
1956  *
1957  *      This function segments the given skb and stores the list of segments
1958  *      in skb->next.
1959  */
1960 static int dev_gso_segment(struct sk_buff *skb, int features)
1961 {
1962         struct sk_buff *segs;
1963
1964         segs = skb_gso_segment(skb, features);
1965
1966         /* Verifying header integrity only. */
1967         if (!segs)
1968                 return 0;
1969
1970         if (IS_ERR(segs))
1971                 return PTR_ERR(segs);
1972
1973         skb->next = segs;
1974         DEV_GSO_CB(skb)->destructor = skb->destructor;
1975         skb->destructor = dev_gso_skb_destructor;
1976
1977         return 0;
1978 }
1979
1980 /*
1981  * Try to orphan skb early, right before transmission by the device.
1982  * We cannot orphan skb if tx timestamp is requested or the sk-reference
1983  * is needed on driver level for other reasons, e.g. see net/can/raw.c
1984  */
1985 static inline void skb_orphan_try(struct sk_buff *skb)
1986 {
1987         struct sock *sk = skb->sk;
1988
1989         if (sk && !skb_shinfo(skb)->tx_flags) {
1990                 /* skb_tx_hash() wont be able to get sk.
1991                  * We copy sk_hash into skb->rxhash
1992                  */
1993                 if (!skb->rxhash)
1994                         skb->rxhash = sk->sk_hash;
1995                 skb_orphan(skb);
1996         }
1997 }
1998
1999 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2000 {
2001         return ((features & NETIF_F_GEN_CSUM) ||
2002                 ((features & NETIF_F_V4_CSUM) &&
2003                  protocol == htons(ETH_P_IP)) ||
2004                 ((features & NETIF_F_V6_CSUM) &&
2005                  protocol == htons(ETH_P_IPV6)) ||
2006                 ((features & NETIF_F_FCOE_CRC) &&
2007                  protocol == htons(ETH_P_FCOE)));
2008 }
2009
2010 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2011 {
2012         if (!can_checksum_protocol(features, protocol)) {
2013                 features &= ~NETIF_F_ALL_CSUM;
2014                 features &= ~NETIF_F_SG;
2015         } else if (illegal_highdma(skb->dev, skb)) {
2016                 features &= ~NETIF_F_SG;
2017         }
2018
2019         return features;
2020 }
2021
2022 u32 netif_skb_features(struct sk_buff *skb)
2023 {
2024         __be16 protocol = skb->protocol;
2025         u32 features = skb->dev->features;
2026
2027         if (protocol == htons(ETH_P_8021Q)) {
2028                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2029                 protocol = veh->h_vlan_encapsulated_proto;
2030         } else if (!vlan_tx_tag_present(skb)) {
2031                 return harmonize_features(skb, protocol, features);
2032         }
2033
2034         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2035
2036         if (protocol != htons(ETH_P_8021Q)) {
2037                 return harmonize_features(skb, protocol, features);
2038         } else {
2039                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2040                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2041                 return harmonize_features(skb, protocol, features);
2042         }
2043 }
2044 EXPORT_SYMBOL(netif_skb_features);
2045
2046 /*
2047  * Returns true if either:
2048  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2049  *      2. skb is fragmented and the device does not support SG, or if
2050  *         at least one of fragments is in highmem and device does not
2051  *         support DMA from it.
2052  */
2053 static inline int skb_needs_linearize(struct sk_buff *skb,
2054                                       int features)
2055 {
2056         return skb_is_nonlinear(skb) &&
2057                         ((skb_has_frag_list(skb) &&
2058                                 !(features & NETIF_F_FRAGLIST)) ||
2059                         (skb_shinfo(skb)->nr_frags &&
2060                                 !(features & NETIF_F_SG)));
2061 }
2062
2063 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2064                         struct netdev_queue *txq)
2065 {
2066         const struct net_device_ops *ops = dev->netdev_ops;
2067         int rc = NETDEV_TX_OK;
2068
2069         if (likely(!skb->next)) {
2070                 u32 features;
2071
2072                 /*
2073                  * If device doesnt need skb->dst, release it right now while
2074                  * its hot in this cpu cache
2075                  */
2076                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2077                         skb_dst_drop(skb);
2078
2079                 if (!list_empty(&ptype_all))
2080                         dev_queue_xmit_nit(skb, dev);
2081
2082                 skb_orphan_try(skb);
2083
2084                 features = netif_skb_features(skb);
2085
2086                 if (vlan_tx_tag_present(skb) &&
2087                     !(features & NETIF_F_HW_VLAN_TX)) {
2088                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2089                         if (unlikely(!skb))
2090                                 goto out;
2091
2092                         skb->vlan_tci = 0;
2093                 }
2094
2095                 if (netif_needs_gso(skb, features)) {
2096                         if (unlikely(dev_gso_segment(skb, features)))
2097                                 goto out_kfree_skb;
2098                         if (skb->next)
2099                                 goto gso;
2100                 } else {
2101                         if (skb_needs_linearize(skb, features) &&
2102                             __skb_linearize(skb))
2103                                 goto out_kfree_skb;
2104
2105                         /* If packet is not checksummed and device does not
2106                          * support checksumming for this protocol, complete
2107                          * checksumming here.
2108                          */
2109                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2110                                 skb_set_transport_header(skb,
2111                                         skb_checksum_start_offset(skb));
2112                                 if (!(features & NETIF_F_ALL_CSUM) &&
2113                                      skb_checksum_help(skb))
2114                                         goto out_kfree_skb;
2115                         }
2116                 }
2117
2118                 rc = ops->ndo_start_xmit(skb, dev);
2119                 trace_net_dev_xmit(skb, rc);
2120                 if (rc == NETDEV_TX_OK)
2121                         txq_trans_update(txq);
2122                 return rc;
2123         }
2124
2125 gso:
2126         do {
2127                 struct sk_buff *nskb = skb->next;
2128
2129                 skb->next = nskb->next;
2130                 nskb->next = NULL;
2131
2132                 /*
2133                  * If device doesnt need nskb->dst, release it right now while
2134                  * its hot in this cpu cache
2135                  */
2136                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2137                         skb_dst_drop(nskb);
2138
2139                 rc = ops->ndo_start_xmit(nskb, dev);
2140                 trace_net_dev_xmit(nskb, rc);
2141                 if (unlikely(rc != NETDEV_TX_OK)) {
2142                         if (rc & ~NETDEV_TX_MASK)
2143                                 goto out_kfree_gso_skb;
2144                         nskb->next = skb->next;
2145                         skb->next = nskb;
2146                         return rc;
2147                 }
2148                 txq_trans_update(txq);
2149                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2150                         return NETDEV_TX_BUSY;
2151         } while (skb->next);
2152
2153 out_kfree_gso_skb:
2154         if (likely(skb->next == NULL))
2155                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2156 out_kfree_skb:
2157         kfree_skb(skb);
2158 out:
2159         return rc;
2160 }
2161
2162 static u32 hashrnd __read_mostly;
2163
2164 /*
2165  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2166  * to be used as a distribution range.
2167  */
2168 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2169                   unsigned int num_tx_queues)
2170 {
2171         u32 hash;
2172         u16 qoffset = 0;
2173         u16 qcount = num_tx_queues;
2174
2175         if (skb_rx_queue_recorded(skb)) {
2176                 hash = skb_get_rx_queue(skb);
2177                 while (unlikely(hash >= num_tx_queues))
2178                         hash -= num_tx_queues;
2179                 return hash;
2180         }
2181
2182         if (dev->num_tc) {
2183                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2184                 qoffset = dev->tc_to_txq[tc].offset;
2185                 qcount = dev->tc_to_txq[tc].count;
2186         }
2187
2188         if (skb->sk && skb->sk->sk_hash)
2189                 hash = skb->sk->sk_hash;
2190         else
2191                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2192         hash = jhash_1word(hash, hashrnd);
2193
2194         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2195 }
2196 EXPORT_SYMBOL(__skb_tx_hash);
2197
2198 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2199 {
2200         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2201                 if (net_ratelimit()) {
2202                         pr_warning("%s selects TX queue %d, but "
2203                                 "real number of TX queues is %d\n",
2204                                 dev->name, queue_index, dev->real_num_tx_queues);
2205                 }
2206                 return 0;
2207         }
2208         return queue_index;
2209 }
2210
/*
 * Look up a TX queue for @skb in the device's XPS map for the current
 * CPU.  Returns the queue index, or -1 when there is no usable map entry,
 * when the chosen queue is not below real_num_tx_queues, or when
 * CONFIG_XPS is off.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int queue_index = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len == 1) {
		queue_index = cpu_map->queues[0];
	} else {
		u32 hash;

		/* Several candidates: pick one by flow hash. */
		if (skb->sk && skb->sk->sk_hash)
			hash = skb->sk->sk_hash;
		else
			hash = (__force u16) skb->protocol ^
			    skb->rxhash;
		hash = jhash_1word(hash, hashrnd);
		queue_index = cpu_map->queues[
		    ((u64)hash * cpu_map->len) >> 32];
	}

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
2248
2249 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2250                                         struct sk_buff *skb)
2251 {
2252         int queue_index;
2253         const struct net_device_ops *ops = dev->netdev_ops;
2254
2255         if (dev->real_num_tx_queues == 1)
2256                 queue_index = 0;
2257         else if (ops->ndo_select_queue) {
2258                 queue_index = ops->ndo_select_queue(dev, skb);
2259                 queue_index = dev_cap_txqueue(dev, queue_index);
2260         } else {
2261                 struct sock *sk = skb->sk;
2262                 queue_index = sk_tx_queue_get(sk);
2263
2264                 if (queue_index < 0 || skb->ooo_okay ||
2265                     queue_index >= dev->real_num_tx_queues) {
2266                         int old_index = queue_index;
2267
2268                         queue_index = get_xps_queue(dev, skb);
2269                         if (queue_index < 0)
2270                                 queue_index = skb_tx_hash(dev, skb);
2271
2272                         if (queue_index != old_index && sk) {
2273                                 struct dst_entry *dst =
2274                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2275
2276                                 if (dst && skb_dst(skb) == dst)
2277                                         sk_tx_queue_set(sk, queue_index);
2278                         }
2279                 }
2280         }
2281
2282         skb_set_queue_mapping(skb, queue_index);
2283         return netdev_get_tx_queue(dev, queue_index);
2284 }
2285
2286 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2287                                  struct net_device *dev,
2288                                  struct netdev_queue *txq)
2289 {
2290         spinlock_t *root_lock = qdisc_lock(q);
2291         bool contended;
2292         int rc;
2293
2294         qdisc_skb_cb(skb)->pkt_len = skb->len;
2295         qdisc_calculate_pkt_len(skb, q);
2296         /*
2297          * Heuristic to force contended enqueues to serialize on a
2298          * separate lock before trying to get qdisc main lock.
2299          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2300          * and dequeue packets faster.
2301          */
2302         contended = qdisc_is_running(q);
2303         if (unlikely(contended))
2304                 spin_lock(&q->busylock);
2305
2306         spin_lock(root_lock);
2307         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2308                 kfree_skb(skb);
2309                 rc = NET_XMIT_DROP;
2310         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2311                    qdisc_run_begin(q)) {
2312                 /*
2313                  * This is a work-conserving queue; there are no old skbs
2314                  * waiting to be sent out; and the qdisc is not running -
2315                  * xmit the skb directly.
2316                  */
2317                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2318                         skb_dst_force(skb);
2319
2320                 qdisc_bstats_update(q, skb);
2321
2322                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2323                         if (unlikely(contended)) {
2324                                 spin_unlock(&q->busylock);
2325                                 contended = false;
2326                         }
2327                         __qdisc_run(q);
2328                 } else
2329                         qdisc_run_end(q);
2330
2331                 rc = NET_XMIT_SUCCESS;
2332         } else {
2333                 skb_dst_force(skb);
2334                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2335                 if (qdisc_run_begin(q)) {
2336                         if (unlikely(contended)) {
2337                                 spin_unlock(&q->busylock);
2338                                 contended = false;
2339                         }
2340                         __qdisc_run(q);
2341                 }
2342         }
2343         spin_unlock(root_lock);
2344         if (unlikely(contended))
2345                 spin_unlock(&q->busylock);
2346         return rc;
2347 }
2348
2349 static DEFINE_PER_CPU(int, xmit_recursion);
2350 #define RECURSION_LIMIT 10
2351
2352 /**
2353  *      dev_queue_xmit - transmit a buffer
2354  *      @skb: buffer to transmit
2355  *
2356  *      Queue a buffer for transmission to a network device. The caller must
2357  *      have set the device and priority and built the buffer before calling
2358  *      this function. The function can be called from an interrupt.
2359  *
2360  *      A negative errno code is returned on a failure. A success does not
2361  *      guarantee the frame will be transmitted as it may be dropped due
2362  *      to congestion or traffic shaping.
2363  *
2364  * -----------------------------------------------------------------------------------
2365  *      I notice this method can also return errors from the queue disciplines,
2366  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2367  *      be positive.
2368  *
2369  *      Regardless of the return value, the skb is consumed, so it is currently
2370  *      difficult to retry a send to this method.  (You can bump the ref count
2371  *      before sending to hold a reference for retry if you are careful.)
2372  *
2373  *      When calling this method, interrupts MUST be enabled.  This is because
2374  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2375  *          --BLG
2376  */
2377 int dev_queue_xmit(struct sk_buff *skb)
2378 {
2379         struct net_device *dev = skb->dev;
2380         struct netdev_queue *txq;
2381         struct Qdisc *q;
2382         int rc = -ENOMEM;
2383
2384         /* Disable soft irqs for various locks below. Also
2385          * stops preemption for RCU.
2386          */
2387         rcu_read_lock_bh();
2388
2389         txq = dev_pick_tx(dev, skb);
2390         q = rcu_dereference_bh(txq->qdisc);
2391
2392 #ifdef CONFIG_NET_CLS_ACT
2393         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2394 #endif
2395         trace_net_dev_queue(skb);
2396         if (q->enqueue) {
2397                 rc = __dev_xmit_skb(skb, q, dev, txq);
2398                 goto out;
2399         }
2400
2401         /* The device has no queue. Common case for software devices:
2402            loopback, all the sorts of tunnels...
2403
2404            Really, it is unlikely that netif_tx_lock protection is necessary
2405            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2406            counters.)
2407            However, it is possible, that they rely on protection
2408            made by us here.
2409
2410            Check this and shot the lock. It is not prone from deadlocks.
2411            Either shot noqueue qdisc, it is even simpler 8)
2412          */
2413         if (dev->flags & IFF_UP) {
2414                 int cpu = smp_processor_id(); /* ok because BHs are off */
2415
2416                 if (txq->xmit_lock_owner != cpu) {
2417
2418                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2419                                 goto recursion_alert;
2420
2421                         HARD_TX_LOCK(dev, txq, cpu);
2422
2423                         if (!netif_tx_queue_stopped(txq)) {
2424                                 __this_cpu_inc(xmit_recursion);
2425                                 rc = dev_hard_start_xmit(skb, dev, txq);
2426                                 __this_cpu_dec(xmit_recursion);
2427                                 if (dev_xmit_complete(rc)) {
2428                                         HARD_TX_UNLOCK(dev, txq);
2429                                         goto out;
2430                                 }
2431                         }
2432                         HARD_TX_UNLOCK(dev, txq);
2433                         if (net_ratelimit())
2434                                 printk(KERN_CRIT "Virtual device %s asks to "
2435                                        "queue packet!\n", dev->name);
2436                 } else {
2437                         /* Recursion is detected! It is possible,
2438                          * unfortunately
2439                          */
2440 recursion_alert:
2441                         if (net_ratelimit())
2442                                 printk(KERN_CRIT "Dead loop on virtual device "
2443                                        "%s, fix it urgently!\n", dev->name);
2444                 }
2445         }
2446
2447         rc = -ENETDOWN;
2448         rcu_read_unlock_bh();
2449
2450         kfree_skb(skb);
2451         return rc;
2452 out:
2453         rcu_read_unlock_bh();
2454         return rc;
2455 }
2456 EXPORT_SYMBOL(dev_queue_xmit);
2457
2458
2459 /*=======================================================================
2460                         Receiver routines
2461   =======================================================================*/
2462
2463 int netdev_max_backlog __read_mostly = 1000;
2464 int netdev_tstamp_prequeue __read_mostly = 1;
2465 int netdev_budget __read_mostly = 300;
2466 int weight_p __read_mostly = 64;            /* old backlog weight */
2467
2468 /* Called with irq disabled */
2469 static inline void ____napi_schedule(struct softnet_data *sd,
2470                                      struct napi_struct *napi)
2471 {
2472         list_add_tail(&napi->poll_list, &sd->poll_list);
2473         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2474 }
2475
2476 /*
2477  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2478  * and src/dst port numbers. Returns a non-zero hash number on success
2479  * and 0 on failure.
2480  */
2481 __u32 __skb_get_rxhash(struct sk_buff *skb)
2482 {
2483         int nhoff, hash = 0, poff;
2484         struct ipv6hdr *ip6;
2485         struct iphdr *ip;
2486         u8 ip_proto;
2487         u32 addr1, addr2, ihl;
2488         union {
2489                 u32 v32;
2490                 u16 v16[2];
2491         } ports;
2492
2493         nhoff = skb_network_offset(skb);
2494
2495         switch (skb->protocol) {
2496         case __constant_htons(ETH_P_IP):
2497                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2498                         goto done;
2499
2500                 ip = (struct iphdr *) (skb->data + nhoff);
2501                 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2502                         ip_proto = 0;
2503                 else
2504                         ip_proto = ip->protocol;
2505                 addr1 = (__force u32) ip->saddr;
2506                 addr2 = (__force u32) ip->daddr;
2507                 ihl = ip->ihl;
2508                 break;
2509         case __constant_htons(ETH_P_IPV6):
2510                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2511                         goto done;
2512
2513                 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2514                 ip_proto = ip6->nexthdr;
2515                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2516                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2517                 ihl = (40 >> 2);
2518                 break;
2519         default:
2520                 goto done;
2521         }
2522
2523         ports.v32 = 0;
2524         poff = proto_ports_offset(ip_proto);
2525         if (poff >= 0) {
2526                 nhoff += ihl * 4 + poff;
2527                 if (pskb_may_pull(skb, nhoff + 4)) {
2528                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2529                         if (ports.v16[1] < ports.v16[0])
2530                                 swap(ports.v16[0], ports.v16[1]);
2531                 }
2532         }
2533
2534         /* get a consistent hash (same value on both flow directions) */
2535         if (addr2 < addr1)
2536                 swap(addr1, addr2);
2537
2538         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2539         if (!hash)
2540                 hash = 1;
2541
2542 done:
2543         return hash;
2544 }
2545 EXPORT_SYMBOL(__skb_get_rxhash);
2546
2547 #ifdef CONFIG_RPS
2548
/*
 * One global table that all flow-based protocols share.
 * Readers in this file fetch it with rcu_dereference() (see get_rps_cpu()).
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
2552
/*
 * set_rps_cpu - move a flow table entry to a new CPU.
 * @dev: device the packet was received on
 * @skb: packet that triggered the move
 * @rflow: flow table entry currently used for this flow
 * @next_cpu: CPU the flow should be steered to (may be RPS_NO_CPU)
 *
 * Stores @next_cpu in @rflow.  When CONFIG_RFS_ACCEL is enabled and the
 * device qualifies, also asks the driver (ndo_rx_flow_steer) to steer the
 * flow to the RX queue that cpu_rmap_lookup_index() returns for @next_cpu;
 * on success the flow is tracked in that queue's flow table instead.
 *
 * Returns the flow entry that now tracks the flow: either @rflow or the
 * entry in the new queue's table.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
            struct rps_dev_flow *rflow, u16 next_cpu)
{
        u16 tcpu;

        tcpu = rflow->cpu = next_cpu;
        if (tcpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
                struct netdev_rx_queue *rxqueue;
                struct rps_dev_flow_table *flow_table;
                struct rps_dev_flow *old_rflow;
                u32 flow_id;
                u16 rxq_index;
                int rc;

                /* Should we steer this flow to a different hardware queue? */
                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
                    !(dev->features & NETIF_F_NTUPLE))
                        goto out;
                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                /* already arriving on the queue chosen for next_cpu */
                if (rxq_index == skb_get_rx_queue(skb))
                        goto out;

                rxqueue = dev->_rx + rxq_index;
                flow_table = rcu_dereference(rxqueue->rps_flow_table);
                if (!flow_table)
                        goto out;
                flow_id = skb->rxhash & flow_table->mask;
                /* a negative return means the driver did not install a filter */
                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
                                                        rxq_index, flow_id);
                if (rc < 0)
                        goto out;
                /* from here on the flow is tracked in the new queue's table */
                old_rflow = rflow;
                rflow = &flow_table->flows[flow_id];
                rflow->cpu = next_cpu;
                rflow->filter = rc;
                /* the old entry must no longer claim the same filter id */
                if (old_rflow->filter == rflow->filter)
                        old_rflow->filter = RPS_NO_FILTER;
        out:
#endif
                /* snapshot the target CPU's queue head for later ordering checks */
                rflow->last_qtail =
                        per_cpu(softnet_data, tcpu).input_queue_head;
        }

        return rflow;
}
2600
2601 /*
2602  * get_rps_cpu is called from netif_receive_skb and returns the target
2603  * CPU from the RPS map of the receiving queue for a given skb.
2604  * rcu_read_lock must be held on entry.
2605  */
2606 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2607                        struct rps_dev_flow **rflowp)
2608 {
2609         struct netdev_rx_queue *rxqueue;
2610         struct rps_map *map;
2611         struct rps_dev_flow_table *flow_table;
2612         struct rps_sock_flow_table *sock_flow_table;
2613         int cpu = -1;
2614         u16 tcpu;
2615
2616         if (skb_rx_queue_recorded(skb)) {
2617                 u16 index = skb_get_rx_queue(skb);
2618                 if (unlikely(index >= dev->real_num_rx_queues)) {
2619                         WARN_ONCE(dev->real_num_rx_queues > 1,
2620                                   "%s received packet on queue %u, but number "
2621                                   "of RX queues is %u\n",
2622                                   dev->name, index, dev->real_num_rx_queues);
2623                         goto done;
2624                 }
2625                 rxqueue = dev->_rx + index;
2626         } else
2627                 rxqueue = dev->_rx;
2628
2629         map = rcu_dereference(rxqueue->rps_map);
2630         if (map) {
2631                 if (map->len == 1 &&
2632                     !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2633                         tcpu = map->cpus[0];
2634                         if (cpu_online(tcpu))
2635                                 cpu = tcpu;
2636                         goto done;
2637                 }
2638         } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2639                 goto done;
2640         }
2641
2642         skb_reset_network_header(skb);
2643         if (!skb_get_rxhash(skb))
2644                 goto done;
2645
2646         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2647         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2648         if (flow_table && sock_flow_table) {
2649                 u16 next_cpu;
2650                 struct rps_dev_flow *rflow;
2651
2652                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2653                 tcpu = rflow->cpu;
2654
2655                 next_cpu = sock_flow_table->ents[skb->rxhash &
2656                     sock_flow_table->mask];
2657
2658                 /*
2659                  * If the desired CPU (where last recvmsg was done) is
2660                  * different from current CPU (one in the rx-queue flow
2661                  * table entry), switch if one of the following holds:
2662                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2663                  *   - Current CPU is offline.
2664                  *   - The current CPU's queue tail has advanced beyond the
2665                  *     last packet that was enqueued using this table entry.
2666                  *     This guarantees that all previous packets for the flow
2667                  *     have been dequeued, thus preserving in order delivery.
2668                  */
2669                 if (unlikely(tcpu != next_cpu) &&
2670                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2671                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2672                       rflow->last_qtail)) >= 0))
2673                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2674
2675                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2676                         *rflowp = rflow;
2677                         cpu = tcpu;
2678                         goto done;
2679                 }
2680         }
2681
2682         if (map) {
2683                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2684
2685                 if (cpu_online(tcpu)) {
2686                         cpu = tcpu;
2687                         goto done;
2688                 }
2689         }
2690
2691 done:
2692         return cpu;
2693 }
2694
2695 #ifdef CONFIG_RFS_ACCEL
2696
2697 /**
2698  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2699  * @dev: Device on which the filter was set
2700  * @rxq_index: RX queue index
2701  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2702  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2703  *
2704  * Drivers that implement ndo_rx_flow_steer() should periodically call
2705  * this function for each installed filter and remove the filters for
2706  * which it returns %true.
2707  */
2708 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2709                          u32 flow_id, u16 filter_id)
2710 {
2711         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2712         struct rps_dev_flow_table *flow_table;
2713         struct rps_dev_flow *rflow;
2714         bool expire = true;
2715         int cpu;
2716
2717         rcu_read_lock();
2718         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2719         if (flow_table && flow_id <= flow_table->mask) {
2720                 rflow = &flow_table->flows[flow_id];
2721                 cpu = ACCESS_ONCE(rflow->cpu);
2722                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2723                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2724                            rflow->last_qtail) <
2725                      (int)(10 * flow_table->mask)))
2726                         expire = false;
2727         }
2728         rcu_read_unlock();
2729         return expire;
2730 }
2731 EXPORT_SYMBOL(rps_may_expire_flow);
2732
2733 #endif /* CONFIG_RFS_ACCEL */
2734
2735 /* Called from hardirq (IPI) context */
2736 static void rps_trigger_softirq(void *data)
2737 {
2738         struct softnet_data *sd = data;
2739
2740         ____napi_schedule(sd, &sd->backlog);
2741         sd->received_rps++;
2742 }
2743
2744 #endif /* CONFIG_RPS */
2745
/*
 * Check whether @sd belongs to another cpu.
 * If it does, push it onto this cpu's IPI list, raise NET_RX_SOFTIRQ
 * and return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *local = &__get_cpu_var(softnet_data);

        if (sd == local)
                return 0;

        /* link sd in at the head of the local IPI list */
        sd->rps_ipi_next = local->rps_ipi_list;
        local->rps_ipi_list = sd;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
#else
        return 0;
#endif /* CONFIG_RPS */
}
2766
2767 /*
2768  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2769  * queue (may be a remote CPU queue).
2770  */
2771 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2772                               unsigned int *qtail)
2773 {
2774         struct softnet_data *sd;
2775         unsigned long flags;
2776
2777         sd = &per_cpu(softnet_data, cpu);
2778
2779         local_irq_save(flags);
2780
2781         rps_lock(sd);
2782         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2783                 if (skb_queue_len(&sd->input_pkt_queue)) {
2784 enqueue:
2785                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2786                         input_queue_tail_incr_save(sd, qtail);
2787                         rps_unlock(sd);
2788                         local_irq_restore(flags);
2789                         return NET_RX_SUCCESS;
2790                 }
2791
2792                 /* Schedule NAPI for backlog device
2793                  * We can use non atomic operation since we own the queue lock
2794                  */
2795                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2796                         if (!rps_ipi_queued(sd))
2797                                 ____napi_schedule(sd, &sd->backlog);
2798                 }
2799                 goto enqueue;
2800         }
2801
2802         sd->dropped++;
2803         rps_unlock(sd);
2804
2805         local_irq_restore(flags);
2806
2807         atomic_long_inc(&skb->dev->rx_dropped);
2808         kfree_skb(skb);
2809         return NET_RX_DROP;
2810 }
2811
2812 /**
2813  *      netif_rx        -       post buffer to the network code
2814  *      @skb: buffer to post
2815  *
2816  *      This function receives a packet from a device driver and queues it for
2817  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2818  *      may be dropped during processing for congestion control or by the
2819  *      protocol layers.
2820  *
2821  *      return values:
2822  *      NET_RX_SUCCESS  (no congestion)
2823  *      NET_RX_DROP     (packet was dropped)
2824  *
2825  */
2826
2827 int netif_rx(struct sk_buff *skb)
2828 {
2829         int ret;
2830
2831         /* if netpoll wants it, pretend we never saw it */
2832         if (netpoll_rx(skb))
2833                 return NET_RX_DROP;
2834
2835         if (netdev_tstamp_prequeue)
2836                 net_timestamp_check(skb);
2837
2838         trace_netif_rx(skb);
2839 #ifdef CONFIG_RPS
2840         {
2841                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2842                 int cpu;
2843
2844                 preempt_disable();
2845                 rcu_read_lock();
2846
2847                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2848                 if (cpu < 0)
2849                         cpu = smp_processor_id();
2850
2851                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2852
2853                 rcu_read_unlock();
2854                 preempt_enable();
2855         }
2856 #else
2857         {
2858                 unsigned int qtail;
2859                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2860                 put_cpu();
2861         }
2862 #endif
2863         return ret;
2864 }
2865 EXPORT_SYMBOL(netif_rx);
2866
/*
 * netif_rx_ni - netif_rx() wrapper that runs pending softirqs itself.
 * @skb: buffer to post
 *
 * Calls netif_rx() with preemption disabled and, if a softirq is then
 * pending on this cpu, runs do_softirq() before re-enabling preemption.
 * Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int rc;

        preempt_disable();

        rc = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();

        return rc;
}
EXPORT_SYMBOL(netif_rx_ni);
2880
2881 static void net_tx_action(struct softirq_action *h)
2882 {
2883         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2884
2885         if (sd->completion_queue) {
2886                 struct sk_buff *clist;
2887
2888                 local_irq_disable();
2889                 clist = sd->completion_queue;
2890                 sd->completion_queue = NULL;
2891                 local_irq_enable();
2892
2893                 while (clist) {
2894                         struct sk_buff *skb = clist;
2895                         clist = clist->next;
2896
2897                         WARN_ON(atomic_read(&skb->users));
2898                         trace_kfree_skb(skb, net_tx_action);
2899                         __kfree_skb(skb);
2900                 }
2901         }
2902
2903         if (sd->output_queue) {
2904                 struct Qdisc *head;
2905
2906                 local_irq_disable();
2907                 head = sd->output_queue;
2908                 sd->output_queue = NULL;
2909                 sd->output_queue_tailp = &sd->output_queue;
2910                 local_irq_enable();
2911
2912                 while (head) {
2913                         struct Qdisc *q = head;
2914                         spinlock_t *root_lock;
2915
2916                         head = head->next_sched;
2917
2918                         root_lock = qdisc_lock(q);
2919                         if (spin_trylock(root_lock)) {
2920                                 smp_mb__before_clear_bit();
2921                                 clear_bit(__QDISC_STATE_SCHED,
2922                                           &q->state);
2923                                 qdisc_run(q);
2924                                 spin_unlock(root_lock);
2925                         } else {
2926                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2927                                               &q->state)) {
2928                                         __netif_reschedule(q);
2929                                 } else {
2930                                         smp_mb__before_clear_bit();
2931                                         clear_bit(__QDISC_STATE_SCHED,
2932                                                   &q->state);
2933                                 }
2934                         }
2935                 }
2936         }
2937 }
2938
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * This hook is defined here for ATM LANE.  It only exists when both the
 * bridge and ATM LANE are configured (built in or as modules).
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
2946
2947 #ifdef CONFIG_NET_CLS_ACT
2948 /* TODO: Maybe we should just force sch_ingress to be compiled in
2949  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2950  * a compare and 2 stores extra right now if we dont have it on
2951  * but have CONFIG_NET_CLS_ACT
2952  * NOTE: This doesnt stop any functionality; if you dont have
2953  * the ingress scheduler, you just cant add policies on ingress.
2954  *
2955  */
2956 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2957 {
2958         struct net_device *dev = skb->dev;
2959         u32 ttl = G_TC_RTTL(skb->tc_verd);
2960         int result = TC_ACT_OK;
2961         struct Qdisc *q;
2962
2963         if (unlikely(MAX_RED_LOOP < ttl++)) {
2964                 if (net_ratelimit())
2965                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2966                                skb->skb_iif, dev->ifindex);
2967                 return TC_ACT_SHOT;
2968         }
2969
2970         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2971         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2972
2973         q = rxq->qdisc;
2974         if (q != &noop_qdisc) {
2975                 spin_lock(qdisc_lock(q));
2976                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2977                         result = qdisc_enqueue_root(skb, q);
2978                 spin_unlock(qdisc_lock(q));
2979         }
2980
2981         return result;
2982 }
2983
2984 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2985                                          struct packet_type **pt_prev,
2986                                          int *ret, struct net_device *orig_dev)
2987 {
2988         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2989
2990         if (!rxq || rxq->qdisc == &noop_qdisc)
2991                 goto out;
2992
2993         if (*pt_prev) {
2994                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2995                 *pt_prev = NULL;
2996         }
2997
2998         switch (ing_filter(skb, rxq)) {
2999         case TC_ACT_SHOT:
3000         case TC_ACT_STOLEN:
3001                 kfree_skb(skb);
3002                 return NULL;
3003         }
3004
3005 out:
3006         skb->tc_verd = 0;
3007         return skb;
3008 }
3009 #endif
3010
/**
 *      netdev_rx_handler_register - register receive handler
 *      @dev: device to register a handler for
 *      @rx_handler: receive handler to register
 *      @rx_handler_data: data pointer that is used by rx handler
 *
 *      Register a receive handler for a device. This handler will then be
 *      called from __netif_receive_skb. Only one handler may be registered
 *      per device: -EBUSY is returned if one is already set, 0 on success.
 *
 *      The caller must hold the rtnl_mutex.
 *
 *      For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
                               rx_handler_func_t *rx_handler,
                               void *rx_handler_data)
{
        ASSERT_RTNL();

        if (dev->rx_handler)
                return -EBUSY;

        /*
         * NOTE(review): data is published before the handler, presumably so
         * a reader that observes the handler also observes its data -- confirm.
         */
        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
        rcu_assign_pointer(dev->rx_handler, rx_handler);

        return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3040
/**
 *      netdev_rx_handler_unregister - unregister receive handler
 *      @dev: device to unregister a handler from
 *
 *      Unregister a receive handler from a device by clearing both the
 *      handler and its data pointer.
 *
 *      The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

        ASSERT_RTNL();
        /* handler is cleared first, then its data (reverse of registration) */
        rcu_assign_pointer(dev->rx_handler, NULL);
        rcu_assign_pointer(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3057
3058 static void vlan_on_bond_hook(struct sk_buff *skb)
3059 {
3060         /*
3061          * Make sure ARP frames received on VLAN interfaces stacked on
3062          * bonding interfaces still make their way to any base bonding
3063          * device that may have registered for a specific ptype.
3064          */
3065         if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
3066             vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
3067             skb->protocol == htons(ETH_P_ARP)) {
3068                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
3069
3070                 if (!skb2)
3071                         return;
3072                 skb2->dev = vlan_dev_real_dev(skb->dev);
3073                 netif_rx(skb2);
3074         }
3075 }
3076
/*
 * __netif_receive_skb - core of the receive path for one skb.
 *
 * In order: optional timestamping, netpoll check, delivery to the
 * ptype_all handlers, ingress filtering (CONFIG_NET_CLS_ACT), the
 * device's rx_handler, hardware-accelerated VLAN handling, the
 * VLAN-on-bond ARP hook and finally the ptype_base protocol handlers.
 *
 * Delivery is deferred by one step through pt_prev: each newly matched
 * handler causes the previous one to be fed via deliver_skb(), and the
 * last match is invoked directly through ->func at the end.
 *
 * Returns the result of the last handler invoked, or NET_RX_DROP when
 * netpoll takes the packet or no handler matched (the skb is then freed
 * and the device's rx_dropped counter incremented).
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct net_device *orig_dev;
        struct net_device *null_or_dev;
        bool deliver_exact = false;
        int ret = NET_RX_DROP;
        __be16 type;

        /* timestamp here unless it was already done before queueing */
        if (!netdev_tstamp_prequeue)
                net_timestamp_check(skb);

        trace_netif_receive_skb(skb);

        /* if we've gotten here through NAPI, check netpoll */
        if (netpoll_receive_skb(skb))
                return NET_RX_DROP;

        if (!skb->skb_iif)
                skb->skb_iif = skb->dev->ifindex;
        orig_dev = skb->dev;

        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb->mac_len = skb->network_header - skb->mac_header;

        pt_prev = NULL;

        rcu_read_lock();

another_round:
        /* re-entered when an rx_handler returns RX_HANDLER_ANOTHER */

        __this_cpu_inc(softnet_data.processed);

#ifdef CONFIG_NET_CLS_ACT
        /* TC_NCLS set: clear it and skip the taps and ingress filter */
        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                goto ncls;
        }
#endif

        /* handlers registered for all protocols */
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (!ptype->dev || ptype->dev == skb->dev) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

#ifdef CONFIG_NET_CLS_ACT
        /* NULL means the ingress filter freed the skb */
        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
                goto out;
ncls:
#endif

        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (rx_handler) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                switch (rx_handler(&skb)) {
                case RX_HANDLER_CONSUMED:
                        goto out;
                case RX_HANDLER_ANOTHER:
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
                        /* fall through */
                case RX_HANDLER_PASS:
                        break;
                default:
                        BUG();
                }
        }

        if (vlan_tx_tag_present(skb)) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                /* on a true return the skb is run through the receive path again */
                if (vlan_hwaccel_do_receive(&skb)) {
                        ret = __netif_receive_skb(skb);
                        goto out;
                } else if (unlikely(!skb))
                        goto out;
        }

        vlan_on_bond_hook(skb);

        /* deliver only exact match when indicated */
        null_or_dev = deliver_exact ? skb->dev : NULL;

        type = skb->protocol;
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type &&
                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
                     ptype->dev == orig_dev)) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

        if (pt_prev) {
                /* last matching handler: called directly, not via deliver_skb() */
                ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
        } else {
                atomic_long_inc(&skb->dev->rx_dropped);
                kfree_skb(skb);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        rcu_read_unlock();
        return ret;
}
3198
3199 /**
3200  *      netif_receive_skb - process receive buffer from network
3201  *      @skb: buffer to process
3202  *
3203  *      netif_receive_skb() is the main receive data processing function.
3204  *      It always succeeds. The buffer may be dropped during processing
3205  *      for congestion control or by the protocol layers.
3206  *
3207  *      This function may only be called from softirq context and interrupts
3208  *      should be enabled.
3209  *
3210  *      Return values (usually ignored):
3211  *      NET_RX_SUCCESS: no congestion
3212  *      NET_RX_DROP: packet was dropped
3213  */
3214 int netif_receive_skb(struct sk_buff *skb)
3215 {
3216         if (netdev_tstamp_prequeue)
3217                 net_timestamp_check(skb);
3218
3219         if (skb_defer_rx_timestamp(skb))
3220                 return NET_RX_SUCCESS;
3221
3222 #ifdef CONFIG_RPS
3223         {
3224                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3225                 int cpu, ret;
3226
3227                 rcu_read_lock();
3228
3229                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3230
3231                 if (cpu >= 0) {
3232                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3233                         rcu_read_unlock();
3234                 } else {
3235                         rcu_read_unlock();
3236                         ret = __netif_receive_skb(skb);
3237                 }
3238
3239                 return ret;
3240         }
3241 #else
3242         return __netif_receive_skb(skb);
3243 #endif
3244 }
3245 EXPORT_SYMBOL(netif_receive_skb);
3246
3247 /* Network device is going away, flush any packets still pending
3248  * Called with irqs disabled.
3249  */
3250 static void flush_backlog(void *arg)
3251 {
3252         struct net_device *dev = arg;
3253         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3254         struct sk_buff *skb, *tmp;
3255
3256         rps_lock(sd);
3257         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3258                 if (skb->dev == dev) {
3259                         __skb_unlink(skb, &sd->input_pkt_queue);
3260                         kfree_skb(skb);
3261                         input_queue_head_incr(sd);
3262                 }
3263         }
3264         rps_unlock(sd);
3265
3266         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3267                 if (skb->dev == dev) {
3268                         __skb_unlink(skb, &sd->process_queue);
3269                         kfree_skb(skb);
3270                         input_queue_head_incr(sd);
3271                 }
3272         }
3273 }
3274
3275 static int napi_gro_complete(struct sk_buff *skb)
3276 {
3277         struct packet_type *ptype;
3278         __be16 type = skb->protocol;
3279         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3280         int err = -ENOENT;
3281
3282         if (NAPI_GRO_CB(skb)->count == 1) {
3283                 skb_shinfo(skb)->gso_size = 0;
3284                 goto out;
3285         }
3286
3287         rcu_read_lock();
3288         list_for_each_entry_rcu(ptype, head, list) {
3289                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3290                         continue;
3291
3292                 err = ptype->gro_complete(skb);
3293                 break;
3294         }
3295         rcu_read_unlock();
3296
3297         if (err) {
3298                 WARN_ON(&ptype->list == head);
3299                 kfree_skb(skb);
3300                 return NET_RX_SUCCESS;
3301         }
3302
3303 out:
3304         return netif_receive_skb(skb);
3305 }
3306
3307 inline void napi_gro_flush(struct napi_struct *napi)
3308 {
3309         struct sk_buff *skb, *next;
3310
3311         for (skb = napi->gro_list; skb; skb = next) {
3312                 next = skb->next;
3313                 skb->next = NULL;
3314                 napi_gro_complete(skb);
3315         }
3316
3317         napi->gro_count = 0;
3318         napi->gro_list = NULL;
3319 }
3320 EXPORT_SYMBOL(napi_gro_flush);
3321
/*
 * dev_gro_receive - try to merge @skb into a flow held on @napi's GRO list.
 *
 * Returns:
 *   GRO_NORMAL       - GRO not applicable (feature off, netpoll active,
 *                      already GSO / has a frag list, no protocol handler,
 *                      flush requested or list full)
 *   GRO_MERGED       - skb matched a held flow (same_flow set by the handler)
 *   GRO_MERGED_FREE  - as GRO_MERGED, with the handler's ->free flag set
 *   GRO_HELD         - skb was put on the GRO list as a new flow
 */
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        struct sk_buff **pp = NULL;
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
        int same_flow;
        int mac_len;
        enum gro_result ret;

        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
                goto normal;

        if (skb_is_gso(skb) || skb_has_frag_list(skb))
                goto normal;

        rcu_read_lock();
        /* use the first device-independent handler that supports GRO */
        list_for_each_entry_rcu(ptype, head, list) {
                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
                        continue;

                skb_set_network_header(skb, skb_gro_offset(skb));
                mac_len = skb->network_header - skb->mac_header;
                skb->mac_len = mac_len;
                NAPI_GRO_CB(skb)->same_flow = 0;
                NAPI_GRO_CB(skb)->flush = 0;
                NAPI_GRO_CB(skb)->free = 0;

                pp = ptype->gro_receive(&napi->gro_list, skb);
                break;
        }
        rcu_read_unlock();

        /* the loop ran to completion: no handler for this protocol */
        if (&ptype->list == head)
                goto normal;

        same_flow = NAPI_GRO_CB(skb)->same_flow;
        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

        /* the handler pointed at a held skb: unlink it and complete it now */
        if (pp) {
                struct sk_buff *nskb = *pp;

                *pp = nskb->next;
                nskb->next = NULL;
                napi_gro_complete(nskb);
                napi->gro_count--;
        }

        if (same_flow)
                goto ok;

        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
                goto normal;

        /* start a new held flow with this skb at the head of the list */
        napi->gro_count++;
        NAPI_GRO_CB(skb)->count = 1;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
        skb->next = napi->gro_list;
        napi->gro_list = skb;
        ret = GRO_HELD;

pull:
        /*
         * The GRO offset lies beyond the linear area: copy the missing
         * bytes from frag0 into the skb tail and trim the first frag.
         */
        if (skb_headlen(skb) < skb_gro_offset(skb)) {
                int grow = skb_gro_offset(skb) - skb_headlen(skb);

                BUG_ON(skb->end - skb->tail < grow);

                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

                skb->tail += grow;
                skb->data_len -= grow;

                skb_shinfo(skb)->frags[0].page_offset += grow;
                skb_shinfo(skb)->frags[0].size -= grow;

                /* first frag fully consumed: release it and shift the rest down */
                if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
                        put_page(skb_shinfo(skb)->frags[0].page);
                        memmove(skb_shinfo(skb)->frags,
                                skb_shinfo(skb)->frags + 1,
                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
                }
        }

ok:
        return ret;

normal:
        ret = GRO_NORMAL;
        goto pull;
}
EXPORT_SYMBOL(dev_gro_receive);
3413
3414 static inline gro_result_t
3415 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3416 {
3417         struct sk_buff *p;
3418
3419         for (p = napi->gro_list; p; p = p->next) {
3420                 unsigned long diffs;
3421
3422                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3423                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3424                 diffs |= compare_ether_header(skb_mac_header(p),
3425                                               skb_gro_mac_header(skb));
3426                 NAPI_GRO_CB(p)->same_flow = !diffs;
3427                 NAPI_GRO_CB(p)->flush = 0;
3428         }
3429
3430         return dev_gro_receive(napi, skb);
3431 }
3432
3433 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3434 {
3435         switch (ret) {
3436         case GRO_NORMAL:
3437                 if (netif_receive_skb(skb))
3438                         ret = GRO_DROP;
3439                 break;
3440
3441         case GRO_DROP:
3442         case GRO_MERGED_FREE:
3443                 kfree_skb(skb);
3444                 break;
3445
3446         case GRO_HELD:
3447         case GRO_MERGED:
3448                 break;
3449         }
3450
3451         return ret;
3452 }
3453 EXPORT_SYMBOL(napi_skb_finish);
3454
3455 void skb_gro_reset_offset(struct sk_buff *skb)
3456 {
3457         NAPI_GRO_CB(skb)->data_offset = 0;
3458         NAPI_GRO_CB(skb)->frag0 = NULL;
3459         NAPI_GRO_CB(skb)->frag0_len = 0;
3460
3461         if (skb->mac_header == skb->tail &&
3462             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3463                 NAPI_GRO_CB(skb)->frag0 =
3464                         page_address(skb_shinfo(skb)->frags[0].page) +
3465                         skb_shinfo(skb)->frags[0].page_offset;
3466                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3467         }
3468 }
3469 EXPORT_SYMBOL(skb_gro_reset_offset);
3470
3471 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3472 {
3473         skb_gro_reset_offset(skb);
3474
3475         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3476 }
3477 EXPORT_SYMBOL(napi_gro_receive);
3478
3479 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3480 {
3481         __skb_pull(skb, skb_headlen(skb));
3482         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3483         skb->vlan_tci = 0;
3484         skb->dev = napi->dev;
3485         skb->skb_iif = 0;
3486
3487         napi->skb = skb;
3488 }
3489
3490 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3491 {
3492         struct sk_buff *skb = napi->skb;
3493
3494         if (!skb) {
3495                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3496                 if (skb)
3497                         napi->skb = skb;
3498         }
3499         return skb;
3500 }
3501 EXPORT_SYMBOL(napi_get_frags);
3502
3503 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3504                                gro_result_t ret)
3505 {
3506         switch (ret) {
3507         case GRO_NORMAL:
3508         case GRO_HELD:
3509                 skb->protocol = eth_type_trans(skb, skb->dev);
3510
3511                 if (ret == GRO_HELD)
3512                         skb_gro_pull(skb, -ETH_HLEN);
3513                 else if (netif_receive_skb(skb))
3514                         ret = GRO_DROP;
3515                 break;
3516
3517         case GRO_DROP:
3518         case GRO_MERGED_FREE:
3519                 napi_reuse_skb(napi, skb);
3520                 break;
3521
3522         case GRO_MERGED:
3523                 break;
3524         }
3525
3526         return ret;
3527 }
3528 EXPORT_SYMBOL(napi_frags_finish);
3529
3530 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3531 {
3532         struct sk_buff *skb = napi->skb;
3533         struct ethhdr *eth;
3534         unsigned int hlen;
3535         unsigned int off;
3536
3537         napi->skb = NULL;
3538
3539         skb_reset_mac_header(skb);
3540         skb_gro_reset_offset(skb);
3541
3542         off = skb_gro_offset(skb);
3543         hlen = off + sizeof(*eth);
3544         eth = skb_gro_header_fast(skb, off);
3545         if (skb_gro_header_hard(skb, hlen)) {
3546                 eth = skb_gro_header_slow(skb, hlen, off);
3547                 if (unlikely(!eth)) {
3548                         napi_reuse_skb(napi, skb);
3549                         skb = NULL;
3550                         goto out;
3551                 }
3552         }
3553
3554         skb_gro_pull(skb, sizeof(*eth));
3555
3556         /*
3557          * This works because the only protocols we care about don't require
3558          * special handling.  We'll fix it up properly at the end.
3559          */
3560         skb->protocol = eth->h_proto;
3561
3562 out:
3563         return skb;
3564 }
3565 EXPORT_SYMBOL(napi_frags_skb);
3566
3567 gro_result_t napi_gro_frags(struct napi_struct *napi)
3568 {
3569         struct sk_buff *skb = napi_frags_skb(napi);
3570
3571         if (!skb)
3572                 return GRO_DROP;
3573
3574         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3575 }
3576 EXPORT_SYMBOL(napi_gro_frags);
3577
/*
 * net_rps_action_and_irq_enable - re-enable local interrupts and, with
 * CONFIG_RPS, send the IPIs queued on sd->rps_ipi_list.
 *
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *pending = sd->rps_ipi_list;

        /* Detach the list while interrupts are still off. */
        if (pending)
                sd->rps_ipi_list = NULL;
#endif

        local_irq_enable();

#ifdef CONFIG_RPS
        /* Send pending IPI's to kick RPS processing on remote cpus. */
        while (pending) {
                struct softnet_data *target = pending;

                /* Fetch the link before the IPI is issued. */
                pending = target->rps_ipi_next;

                if (cpu_online(target->cpu))
                        __smp_call_function_single(target->cpu,
                                                   &target->csd, 0);
        }
#endif
}
3605
     /*
      * process_backlog - poll routine for the per-CPU backlog napi.
      * @napi:  the backlog napi embedded in a struct softnet_data
      * @quota: upper bound on the number of packets to process
      *
      * Feeds packets from sd->process_queue to __netif_receive_skb(), refilling
      * that queue from sd->input_pkt_queue under rps_lock() whenever it runs
      * dry. Returns the number of packets processed.
      */
3606 static int process_backlog(struct napi_struct *napi, int quota)
3607 {
3608         int work = 0;
3609         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3610
3611 #ifdef CONFIG_RPS
3612         /* If IPIs are pending it is better to send them now rather than
3613          * wait for the end of net_rx_action().
3614          */
3615         if (sd->rps_ipi_list) {
3616                 local_irq_disable();
3617                 net_rps_action_and_irq_enable(sd);
3618         }
3619 #endif
3620         napi->weight = weight_p;
3621         local_irq_disable();
3622         while (work < quota) {
3623                 struct sk_buff *skb;
3624                 unsigned int qlen;
3625
             /* Interrupts are enabled only around the actual delivery. */
3626                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3627                         local_irq_enable();
3628                         __netif_receive_skb(skb);
3629                         local_irq_disable();
3630                         input_queue_head_incr(sd);
3631                         if (++work >= quota) {
3632                                 local_irq_enable();
3633                                 return work;
3634                         }
3635                 }
3636
             /* process_queue is empty: move everything queued on input_pkt_queue over. */
3637                 rps_lock(sd);
3638                 qlen = skb_queue_len(&sd->input_pkt_queue);
3639                 if (qlen)
3640                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3641                                                    &sd->process_queue);
3642
3643                 if (qlen < quota - work) {
3644                         /*
3645                          * Inline a custom version of __napi_complete().
3646                          * Only the current cpu owns and manipulates this napi,
3647                          * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3648                          * so we can use a plain write instead of clear_bit(),
3649                          * and we don't need an smp_mb() memory barrier.
3650                          */
3651                         list_del(&napi->poll_list);
3652                         napi->state = 0;
3653
                     /* Shrink the quota so the loop ends once the spliced packets are done. */
3654                         quota = work + qlen;
3655                 }
3656                 rps_unlock(sd);
3657         }
3658         local_irq_enable();
3659
3660         return work;
3661 }
3662
3663 /**
3664  * __napi_schedule - schedule for receive
3665  * @n: entry to schedule
3666  *
3667  * The entry's receive function will be scheduled to run
3668  */
3669 void __napi_schedule(struct napi_struct *n)
3670 {
3671         unsigned long flags;
3672
3673         local_irq_save(flags);
3674         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3675         local_irq_restore(flags);
3676 }
3677 EXPORT_SYMBOL(__napi_schedule);
3678
     /*
      * __napi_complete - take @n off its poll list and clear NAPI_STATE_SCHED.
      *
      * BUGs if @n is not currently scheduled or still has packets on its
      * gro_list. napi_complete() in this file calls this with local interrupts
      * disabled.
      */
3679 void __napi_complete(struct napi_struct *n)
3680 {
3681         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3682         BUG_ON(n->gro_list);
3683
3684         list_del(&n->poll_list);
         /* Barrier first, so the list removal is ordered before the bit is cleared. */
3685         smp_mb__before_clear_bit();
3686         clear_bit(NAPI_STATE_SCHED, &n->state);
3687 }
3688 EXPORT_SYMBOL(__napi_complete);
3689
3690 void napi_complete(struct napi_struct *n)
3691 {
3692         unsigned long flags;
3693
3694         /*
3695          * don't let napi dequeue from the cpu poll list
3696          * just in case its running on a different cpu
3697          */
3698         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3699                 return;
3700
3701         napi_gro_flush(n);
3702         local_irq_save(flags);
3703         __napi_complete(n);
3704         local_irq_restore(flags);
3705 }
3706 EXPORT_SYMBOL(napi_complete);
3707
3708 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3709                     int (*poll)(struct napi_struct *, int), int weight)
3710 {
3711         INIT_LIST_HEAD(&napi->poll_list);
3712         napi->gro_count = 0;
3713         napi->gro_list = NULL;
3714         napi->skb = NULL;
3715         napi->poll = poll;
3716         napi->weight = weight;
3717         list_add(&napi->dev_list, &dev->napi_list);
3718         napi->dev = dev;
3719 #ifdef CONFIG_NETPOLL
3720         spin_lock_init(&napi->poll_lock);
3721         napi->poll_owner = -1;
3722 #endif
3723         set_bit(NAPI_STATE_SCHED, &napi->state);
3724 }
3725 EXPORT_SYMBOL(netif_napi_add);
3726
3727 void netif_napi_del(struct napi_struct *napi)
3728 {
3729         struct sk_buff *skb, *next;
3730
3731         list_del_init(&napi->dev_list);
3732         napi_free_frags(napi);
3733
3734         for (skb = napi->gro_list; skb; skb = next) {
3735                 next = skb->next;
3736                 skb->next = NULL;
3737                 kfree_skb(skb);
3738         }
3739
3740         napi->gro_list = NULL;
3741         napi->gro_count = 0;
3742 }
3743 EXPORT_SYMBOL(netif_napi_del);
3744
     /*
      * net_rx_action - softirq handler that polls the napi instances queued on
      * this CPU's softnet_data poll list.
      *
      * Polling stops when the list is empty, when the budget (netdev_budget)
      * is used up, or when more than two jiffies have passed; in the latter
      * two cases time_squeeze is incremented and NET_RX_SOFTIRQ is raised again.
      */
3745 static void net_rx_action(struct softirq_action *h)
3746 {
3747         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3748         unsigned long time_limit = jiffies + 2;
3749         int budget = netdev_budget;
3750         void *have;
3751
3752         local_irq_disable();
3753
3754         while (!list_empty(&sd->poll_list)) {
3755                 struct napi_struct *n;
3756                 int work, weight;
3757
3758                 /* If the softirq window is exhausted then punt.
3759                  * Allow this to run for 2 jiffies, which will allow
3760                  * an average latency of 1.5/HZ.
3761                  */
3762                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3763                         goto softnet_break;
3764
3765                 local_irq_enable();
3766
3767                 /* Even though interrupts have been re-enabled, this
3768                  * access is safe because interrupts can only add new
3769                  * entries to the tail of this list, and only ->poll()
3770                  * calls can remove this head entry from the list.
3771                  */
3772                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3773
3774                 have = netpoll_poll_lock(n);
3775
3776                 weight = n->weight;
3777
3778                 /* This NAPI_STATE_SCHED test is for avoiding a race
3779                  * with netpoll's poll_napi().  Only the entity which
3780                  * obtains the lock and sees NAPI_STATE_SCHED set will
3781                  * actually make the ->poll() call.  Therefore we avoid
3782                  * accidentally calling ->poll() when NAPI is not scheduled.
3783                  */
3784                 work = 0;
3785                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3786                         work = n->poll(n, weight);
3787                         trace_napi_poll(n);
3788                 }
3789
             /* A poll routine reporting more work than its weight gets a one-time warning. */
3790                 WARN_ON_ONCE(work > weight);
3791
3792                 budget -= work;
3793
3794                 local_irq_disable();
3795
3796                 /* Drivers must not modify the NAPI state if they
3797                  * consume the entire weight.  In such cases this code
3798                  * still "owns" the NAPI instance and therefore can
3799                  * move the instance around on the list at-will.
3800                  */
3801                 if (unlikely(work == weight)) {
3802                         if (unlikely(napi_disable_pending(n))) {
                             /* napi_complete() is called with interrupts enabled. */
3803                                 local_irq_enable();
3804                                 napi_complete(n);
3805                                 local_irq_disable();
3806                         } else
3807                                 list_move_tail(&n->poll_list, &sd->poll_list);
3808                 }
3809
3810                 netpoll_poll_unlock(have);
3811         }
3812 out:
         /* Entered with interrupts disabled; this call leaves them enabled. */
3813         net_rps_action_and_irq_enable(sd);
3814
3815 #ifdef CONFIG_NET_DMA
3816         /*
3817          * There may not be any more sk_buffs coming right now, so push
3818          * any pending DMA copies to hardware
3819          */
3820         dma_issue_pending_all();
3821 #endif
3822
3823         return;
3824
3825 softnet_break:
         /* Out of budget or time: count the squeeze and reschedule the softirq. */
3826         sd->time_squeeze++;
3827         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3828         goto out;
3829 }
3830
     /*
      * Address dumping handlers indexed by address family; slots are filled in
      * by register_gifconf() and walked by dev_ifconf().
      */
3831 static gifconf_func_t *gifconf_list[NPROTO];
3832
3833 /**
3834  *      register_gifconf        -       register a SIOCGIF handler
3835  *      @family: Address family
3836  *      @gifconf: Function handler
3837  *
3838  *      Register protocol dependent address dumping routines. The handler
3839  *      that is passed must not be freed or reused until it has been replaced
3840  *      by another handler.
3841  */
3842 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3843 {
3844         if (family >= NPROTO)
3845                 return -EINVAL;
3846         gifconf_list[family] = gifconf;
3847         return 0;
3848 }
3849 EXPORT_SYMBOL(register_gifconf);
3850
3851
3852 /*
3853  *      Map an interface index to its name (SIOCGIFNAME)
3854  */
3855
3856 /*
3857  *      We need this ioctl for efficient implementation of the
3858  *      if_indextoname() function required by the IPv6 API.  Without
3859  *      it, we would have to search all the interfaces to find a
3860  *      match.  --pb
3861  */
3862
3863 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3864 {
3865         struct net_device *dev;
3866         struct ifreq ifr;
3867
3868         /*
3869          *      Fetch the caller's info block.
3870          */
3871
3872         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3873                 return -EFAULT;
3874
3875         rcu_read_lock();
3876         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3877         if (!dev) {
3878                 rcu_read_unlock();
3879                 return -ENODEV;
3880         }
3881
3882         strcpy(ifr.ifr_name, dev->name);
3883         rcu_read_unlock();
3884
3885         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3886                 return -EFAULT;
3887         return 0;
3888 }
3889
3890 /*
3891  *      Perform a SIOCGIFCONF call. This structure will change
3892  *      size eventually, and there is nothing I can do about it.
3893  *      Thus we will need a 'compatibility mode'.
3894  */
3895
3896 static int dev_ifconf(struct net *net, char __user *arg)
3897 {
3898         struct ifconf ifc;
3899         struct net_device *dev;
3900         char __user *pos;
3901         int len;
3902         int total;
3903         int i;
3904
3905         /*
3906          *      Fetch the caller's info block.
3907          */
3908
3909         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3910                 return -EFAULT;
3911
3912         pos = ifc.ifc_buf;
3913         len = ifc.ifc_len;
3914
3915         /*
3916          *      Loop over the interfaces, and write an info block for each.
3917          */
3918
3919         total = 0;
3920         for_each_netdev(net, dev) {
3921                 for (i = 0; i < NPROTO; i++) {
3922                         if (gifconf_list[i]) {
3923                                 int done;
3924                                 if (!pos)
3925                                         done = gifconf_list[i](dev, NULL, 0);
3926                                 else
3927                                         done = gifconf_list[i](dev, pos + total,
3928                                                                len - total);
3929                                 if (done < 0)
3930                                         return -EFAULT;
3931                                 total += done;
3932                         }
3933                 }
3934         }
3935
3936         /*
3937          *      All done.  Write the updated control block back to the caller.
3938          */
3939         ifc.ifc_len = total;
3940
3941         /*
3942          *      Both BSD and Solaris return 0 here, so we do too.
3943          */
3944         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3945 }
3946
3947 #ifdef CONFIG_PROC_FS
3948 /*
3949  *      This is invoked by the /proc filesystem handler to display a device
3950  *      in detail.
3951  */
3952 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3953         __acquires(RCU)
3954 {
3955         struct net *net = seq_file_net(seq);
3956         loff_t off;
3957         struct net_device *dev;
3958
3959         rcu_read_lock();
3960         if (!*pos)
3961                 return SEQ_START_TOKEN;
3962
3963         off = 1;
3964         for_each_netdev_rcu(net, dev)
3965                 if (off++ == *pos)
3966                         return dev;
3967
3968         return NULL;
3969 }
3970
3971 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3972 {
3973         struct net_device *dev = v;
3974
3975         if (v == SEQ_START_TOKEN)
3976                 dev = first_net_device_rcu(seq_file_net(seq));
3977         else
3978                 dev = next_net_device_rcu(dev);
3979
3980         ++*pos;
3981         return dev;
3982 }
3983
     /* seq_file stop callback: drop the RCU read lock taken in dev_seq_start(). */
3984 void dev_seq_stop(struct seq_file *seq, void *v)
3985         __releases(RCU)
3986 {
3987         rcu_read_unlock();
3988 }
3989
3990 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3991 {
3992         struct rtnl_link_stats64 temp;
3993         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3994
3995         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3996                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3997                    dev->name, stats->rx_bytes, stats->rx_packets,
3998                    stats->rx_errors,
3999                    stats->rx_dropped + stats->rx_missed_errors,
4000                    stats->rx_fifo_errors,
4001                    stats->rx_length_errors + stats->rx_over_errors +
4002                     stats->rx_crc_errors + stats->rx_frame_errors,
4003                    stats->rx_compressed, stats->multicast,
4004                    stats->tx_bytes, stats->tx_packets,
4005                    stats->tx_errors, stats->tx_dropped,
4006                    stats->tx_fifo_errors, stats->collisions,
4007                    stats->tx_carrier_errors +
4008                     stats->tx_aborted_errors +
4009                     stats->tx_window_errors +
4010                     stats->tx_heartbeat_errors,
4011                    stats->tx_compressed);
4012 }
4013
4014 /*
4015  *      Called from the PROCfs module. This now uses the new arbitrary sized
4016  *      /proc/net interface to create /proc/net/dev
4017  */
4018 static int dev_seq_show(struct seq_file *seq, void *v)
4019 {
4020         if (v == SEQ_START_TOKEN)
4021                 seq_puts(seq, "Inter-|   Receive                            "
4022                               "                    |  Transmit\n"
4023                               " face |bytes    packets errs drop fifo frame "
4024                               "compressed multicast|bytes    packets errs "
4025                               "drop fifo colls carrier compressed\n");
4026         else
4027                 dev_seq_printf_stats(seq, v);
4028         return 0;
4029 }
4030
4031 static struct softnet_data *softnet_get_online(loff_t *pos)
4032 {
4033         struct softnet_data *sd = NULL;
4034
4035         while (*pos < nr_cpu_ids)
4036                 if (cpu_online(*pos)) {
4037                         sd = &per_cpu(softnet_data, *pos);
4038                         break;
4039                 } else
4040                         ++*pos;
4041         return sd;
4042 }
4043
     /* seq_file start callback: softnet_data of the first online CPU at or after *pos. */
4044 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4045 {
4046         return softnet_get_online(pos);
4047 }
4048
     /* seq_file next callback: move past the current CPU and find the next online one. */
4049 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4050 {
4051         ++*pos;
4052         return softnet_get_online(pos);
4053 }
4054
     /* seq_file stop callback: softnet_seq_start() acquires nothing, so nothing is released. */
4055 static void softnet_seq_stop(struct seq_file *seq, void *v)
4056 {
4057 }
4058
4059 static int softnet_seq_show(struct seq_file *seq, void *v)
4060 {
4061         struct softnet_data *sd = v;
4062
4063         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4064                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4065                    0, 0, 0, 0, /* was fastroute */
4066                    sd->cpu_collision, sd->received_rps);
4067         return 0;
4068 }
4069
     /* seq_file iterator callbacks for the per-device statistics listing. */
4070 static const struct seq_operations dev_seq_ops = {
4071         .start = dev_seq_start,
4072         .next  = dev_seq_next,
4073         .stop  = dev_seq_stop,
4074         .show  = dev_seq_show,
4075 };
4076
     /* open handler: set up a per-namespace seq_file iterating with dev_seq_ops. */
4077 static int dev_seq_open(struct inode *inode, struct file *file)
4078 {
4079         return seq_open_net(inode, file, &dev_seq_ops,
4080                             sizeof(struct seq_net_private));
4081 }
4082
     /* File operations for the "dev" proc entry registered in dev_proc_net_init(). */
4083 static const struct file_operations dev_seq_fops = {
4084         .owner   = THIS_MODULE,
4085         .open    = dev_seq_open,
4086         .read    = seq_read,
4087         .llseek  = seq_lseek,
4088         .release = seq_release_net,
4089 };
4090
     /* seq_file iterator callbacks for the per-CPU softnet statistics listing. */
4091 static const struct seq_operations softnet_seq_ops = {
4092         .start = softnet_seq_start,
4093         .next  = softnet_seq_next,
4094         .stop  = softnet_seq_stop,
4095         .show  = softnet_seq_show,
4096 };
4097
     /* open handler: plain seq_open() with softnet_seq_ops (no per-namespace private data). */
4098 static int softnet_seq_open(struct inode *inode, struct file *file)
4099 {
4100         return seq_open(file, &softnet_seq_ops);
4101 }
4102
     /* File operations for the "softnet_stat" proc entry registered in dev_proc_net_init(). */
4103 static const struct file_operations softnet_seq_fops = {
4104         .owner   = THIS_MODULE,
4105         .open    = softnet_seq_open,
4106         .read    = seq_read,
4107         .llseek  = seq_lseek,
4108         .release = seq_release,
4109 };
4110
4111 static void *ptype_get_idx(loff_t pos)
4112 {
4113         struct packet_type *pt = NULL;
4114         loff_t i = 0;
4115         int t;
4116
4117         list_for_each_entry_rcu(pt, &ptype_all, list) {
4118                 if (i == pos)
4119                         return pt;
4120                 ++i;
4121         }
4122
4123         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4124                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4125                         if (i == pos)
4126                                 return pt;
4127                         ++i;
4128                 }
4129         }
4130         return NULL;
4131 }
4132
4133 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4134         __acquires(RCU)
4135 {
4136         rcu_read_lock();
4137         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4138 }
4139
4140 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4141 {
4142         struct packet_type *pt;
4143         struct list_head *nxt;
4144         int hash;
4145
4146         ++*pos;
4147         if (v == SEQ_START_TOKEN)
4148                 return ptype_get_idx(0);
4149
4150         pt = v;
4151         nxt = pt->list.next;
4152         if (pt->type == htons(ETH_P_ALL)) {
4153                 if (nxt != &ptype_all)
4154                         goto found;
4155                 hash = 0;
4156                 nxt = ptype_base[0].next;
4157         } else
4158                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4159
4160         while (nxt == &ptype_base[hash]) {
4161                 if (++hash >= PTYPE_HASH_SIZE)
4162                         return NULL;
4163                 nxt = ptype_base[hash].next;
4164         }
4165 found:
4166         return list_entry(nxt, struct packet_type, list);
4167 }
4168
     /* seq_file stop callback: drop the RCU read lock taken in ptype_seq_start(). */
4169 static void ptype_seq_stop(struct seq_file *seq, void *v)
4170         __releases(RCU)
4171 {
4172         rcu_read_unlock();
4173 }
4174
4175 static int ptype_seq_show(struct seq_file *seq, void *v)
4176 {
4177         struct packet_type *pt = v;
4178
4179         if (v == SEQ_START_TOKEN)
4180                 seq_puts(seq, "Type Device      Function\n");
4181         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4182                 if (pt->type == htons(ETH_P_ALL))
4183                         seq_puts(seq, "ALL ");
4184                 else
4185                         seq_printf(seq, "%04x", ntohs(pt->type));
4186
4187                 seq_printf(seq, " %-8s %pF\n",
4188                            pt->dev ? pt->dev->name : "", pt->func);
4189         }
4190
4191         return 0;
4192 }
4193
     /* seq_file iterator callbacks for the packet type listing. */
4194 static const struct seq_operations ptype_seq_ops = {
4195         .start = ptype_seq_start,
4196         .next  = ptype_seq_next,
4197         .stop  = ptype_seq_stop,
4198         .show  = ptype_seq_show,
4199 };
4200
     /* open handler: set up a per-namespace seq_file iterating with ptype_seq_ops. */
4201 static int ptype_seq_open(struct inode *inode, struct file *file)
4202 {
4203         return seq_open_net(inode, file, &ptype_seq_ops,
4204                         sizeof(struct seq_net_private));
4205 }
4206
     /* File operations for the "ptype" proc entry registered in dev_proc_net_init(). */
4207 static const struct file_operations ptype_seq_fops = {
4208         .owner   = THIS_MODULE,
4209         .open    = ptype_seq_open,
4210         .read    = seq_read,
4211         .llseek  = seq_lseek,
4212         .release = seq_release_net,
4213 };
4214
4215
4216 static int __net_init dev_proc_net_init(struct net *net)
4217 {
4218         int rc = -ENOMEM;
4219
4220         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4221                 goto out;
4222         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4223                 goto out_dev;
4224         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4225                 goto out_softnet;
4226
4227         if (wext_proc_init(net))
4228                 goto out_ptype;
4229         rc = 0;
4230 out:
4231         return rc;
4232 out_ptype:
4233         proc_net_remove(net, "ptype");
4234 out_softnet:
4235         proc_net_remove(net, "softnet_stat");
4236 out_dev:
4237         proc_net_remove(net, "dev");
4238         goto out;
4239 }
4240
     /*
      * Per-namespace exit: undo dev_proc_net_init() in reverse order, i.e. the
      * wext proc state first, then the "ptype", "softnet_stat" and "dev" entries.
      */
4241 static void __net_exit dev_proc_net_exit(struct net *net)
4242 {
4243         wext_proc_exit(net);
4244
4245         proc_net_remove(net, "ptype");
4246         proc_net_remove(net, "softnet_stat");
4247         proc_net_remove(net, "dev");
4248 }
4249
     /* Per-network-namespace hooks that create and remove the proc entries above. */
4250 static struct pernet_operations __net_initdata dev_proc_ops = {
4251         .init = dev_proc_net_init,
4252         .exit = dev_proc_net_exit,
4253 };
4254
     /* Register dev_proc_ops as a pernet subsystem; returns register_pernet_subsys()'s result. */
4255 static int __init dev_proc_init(void)
4256 {
4257         return register_pernet_subsys(&dev_proc_ops);
4258 }
4259 #else
     /* Without CONFIG_PROC_FS there is nothing to register: dev_proc_init() is just 0. */
4260 #define dev_proc_init() 0
4261 #endif  /* CONFIG_PROC_FS */
4262
4263
4264 /**
4265  *      netdev_set_master       -       set up master pointer
4266  *      @slave: slave device
4267  *      @master: new master device
4268  *
4269  *      Changes the master device of the slave. Pass %NULL to break the
4270  *      bonding. The caller must hold the RTNL semaphore. On a failure
4271  *      a negative errno code is returned. On success the reference counts
4272  *      are adjusted and the function returns zero.
4273  */
4274 int netdev_set_master(struct net_device *slave, struct net_device *master)
4275 {
4276         struct net_device *old = slave->master;
4277
4278         ASSERT_RTNL();
4279
4280         if (master) {
4281                 if (old)
4282                         return -EBUSY;
4283                 dev_hold(master);
4284         }
4285
4286         slave->master = master;
4287
4288         if (old) {
4289                 synchronize_net();
4290                 dev_put(old);
4291         }
4292         return 0;
4293 }
4294 EXPORT_SYMBOL(netdev_set_master);
4295
4296 /**
4297  *      netdev_set_bond_master  -       set up bonding master/slave pair
4298  *      @slave: slave device
4299  *      @master: new master device
4300  *
4301  *      Changes the master device of the slave. Pass %NULL to break the
4302  *      bonding. The caller must hold the RTNL semaphore. On a failure
4303  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4304  *      to the routing socket and the function returns zero.
4305  */
4306 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4307 {
4308         int err;
4309
4310         ASSERT_RTNL();
4311
4312         err = netdev_set_master(slave, master);
4313         if (err)
4314                 return err;
4315         if (master)
4316                 slave->flags |= IFF_SLAVE;
4317         else
4318                 slave->flags &= ~IFF_SLAVE;
4319
4320         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4321         return 0;
4322 }
4323 EXPORT_SYMBOL(netdev_set_bond_master);
4324
4325 static void dev_change_rx_flags(struct net_device *dev, int flags)
4326 {
4327         const struct net_device_ops *ops = dev->netdev_ops;
4328
4329         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4330                 ops->ndo_change_rx_flags(dev, flags);
4331 }
4332
4333 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4334 {
4335         unsigned short old_flags = dev->flags;
4336         uid_t uid;
4337         gid_t gid;
4338
4339         ASSERT_RTNL();
4340
4341         dev->flags |= IFF_PROMISC;
4342         dev->promiscuity += inc;
4343         if (dev->promiscuity == 0) {
4344                 /*
4345                  * Avoid overflow.
4346                  * If inc causes overflow, untouch promisc and return error.
4347                  */
4348                 if (inc < 0)
4349                         dev->flags &= ~IFF_PROMISC;
4350                 else {
4351                         dev->promiscuity -= inc;
4352                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4353                                 "set promiscuity failed, promiscuity feature "
4354                                 "of device might be broken.\n", dev->name);
4355                         return -EOVERFLOW;
4356                 }
4357         }
4358         if (dev->flags != old_flags) {
4359                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4360                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4361                                                                "left");
4362                 if (audit_enabled) {
4363                         current_uid_gid(&uid, &gid);
4364                         audit_log(current->audit_context, GFP_ATOMIC,
4365                                 AUDIT_ANOM_PROMISCUOUS,
4366                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4367                                 dev->name, (dev->flags & IFF_PROMISC),
4368                                 (old_flags & IFF_PROMISC),
4369                                 audit_get_loginuid(current),
4370                                 uid, gid,
4371                                 audit_get_sessionid(current));
4372                 }
4373
4374                 dev_change_rx_flags(dev, IFF_PROMISC);
4375         }
4376         return 0;
4377 }
4378
4379 /**
4380  *      dev_set_promiscuity     - update promiscuity count on a device
4381  *      @dev: device
4382  *      @inc: modifier
4383  *
4384  *      Add or remove promiscuity from a device. While the count in the device
4385  *      remains above zero the interface remains promiscuous. Once it hits zero
4386  *      the device reverts back to normal filtering operation. A negative inc
4387  *      value is used to drop promiscuity on the device.
4388  *      Return 0 if successful or a negative errno code on error.
4389  */
4390 int dev_set_promiscuity(struct net_device *dev, int inc)
4391 {
4392         unsigned short old_flags = dev->flags;
4393         int err;
4394
4395         err = __dev_set_promiscuity(dev, inc);
4396         if (err < 0)
4397                 return err;
4398         if (dev->flags != old_flags)
4399                 dev_set_rx_mode(dev);
4400         return err;
4401 }
4402 EXPORT_SYMBOL(dev_set_promiscuity);
4403
4404 /**
4405  *      dev_set_allmulti        - update allmulti count on a device
4406  *      @dev: device
4407  *      @inc: modifier
4408  *
4409  *      Add or remove reception of all multicast frames to a device. While the
4410  *      count in the device remains above zero the interface remains listening
4411  *      to all interfaces. Once it hits zero the device reverts back to normal
4412  *      filtering operation. A negative @inc value is used to drop the counter
4413  *      when releasing a resource needing all multicasts.
4414  *      Return 0 if successful or a negative errno code on error.
4415  */
4416
4417 int dev_set_allmulti(struct net_device *dev, int inc)
4418 {
4419         unsigned short old_flags = dev->flags;
4420
4421         ASSERT_RTNL();
4422
4423         dev->flags |= IFF_ALLMULTI;
4424         dev->allmulti += inc;
4425         if (dev->allmulti == 0) {
4426                 /*
4427                  * Avoid overflow.
4428                  * If inc causes overflow, untouch allmulti and return error.
4429                  */
4430                 if (inc < 0)
4431                         dev->flags &= ~IFF_ALLMULTI;
4432                 else {
4433                         dev->allmulti -= inc;
4434                         printk(KERN_WARNING "%s: allmulti touches roof, "
4435                                 "set allmulti failed, allmulti feature of "
4436                                 "device might be broken.\n", dev->name);
4437                         return -EOVERFLOW;
4438                 }
4439         }
4440         if (dev->flags ^ old_flags) {
4441                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4442                 dev_set_rx_mode(dev);
4443         }
4444         return 0;
4445 }
4446 EXPORT_SYMBOL(dev_set_allmulti);
4447
4448 /*
4449  *      Upload unicast and multicast address lists to device and
4450  *      configure RX filtering. When the device doesn't support unicast
4451  *      filtering it is put in promiscuous mode while unicast addresses
4452  *      are present.
4453  */
4454 void __dev_set_rx_mode(struct net_device *dev)
4455 {
4456         const struct net_device_ops *ops = dev->netdev_ops;
4457
4458         /* dev_open will call this function so the list will stay sane. */
4459         if (!(dev->flags&IFF_UP))
4460                 return;
4461
4462         if (!netif_device_present(dev))
4463                 return;
4464
4465         if (ops->ndo_set_rx_mode)
4466                 ops->ndo_set_rx_mode(dev);
4467         else {
4468                 /* Unicast addresses changes may only happen under the rtnl,
4469                  * therefore calling __dev_set_promiscuity here is safe.
4470                  */
4471                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4472                         __dev_set_promiscuity(dev, 1);
4473                         dev->uc_promisc = 1;
4474                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4475                         __dev_set_promiscuity(dev, -1);
4476                         dev->uc_promisc = 0;
4477                 }
4478
4479                 if (ops->ndo_set_multicast_list)
4480                         ops->ndo_set_multicast_list(dev);
4481         }
4482 }
4483
/*
 *	Locked variant of __dev_set_rx_mode(): the call is bracketed by
 *	netif_addr_lock_bh() / netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
4490
4491 /**
4492  *      dev_get_flags - get flags reported to userspace
4493  *      @dev: device
4494  *
4495  *      Get the combination of flag bits exported through APIs to userspace.
4496  */
4497 unsigned dev_get_flags(const struct net_device *dev)
4498 {
4499         unsigned flags;
4500
4501         flags = (dev->flags & ~(IFF_PROMISC |
4502                                 IFF_ALLMULTI |
4503                                 IFF_RUNNING |
4504                                 IFF_LOWER_UP |
4505                                 IFF_DORMANT)) |
4506                 (dev->gflags & (IFF_PROMISC |
4507                                 IFF_ALLMULTI));
4508
4509         if (netif_running(dev)) {
4510                 if (netif_oper_up(dev))
4511                         flags |= IFF_RUNNING;
4512                 if (netif_carrier_ok(dev))
4513                         flags |= IFF_LOWER_UP;
4514                 if (netif_dormant(dev))
4515                         flags |= IFF_DORMANT;
4516         }
4517
4518         return flags;
4519 }
4520 EXPORT_SYMBOL(dev_get_flags);
4521
4522 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4523 {
4524         int old_flags = dev->flags;
4525         int ret;
4526
4527         ASSERT_RTNL();
4528
4529         /*
4530          *      Set the flags on our device.
4531          */
4532
4533         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4534                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4535                                IFF_AUTOMEDIA)) |
4536                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4537                                     IFF_ALLMULTI));
4538
4539         /*
4540          *      Load in the correct multicast list now the flags have changed.
4541          */
4542
4543         if ((old_flags ^ flags) & IFF_MULTICAST)
4544                 dev_change_rx_flags(dev, IFF_MULTICAST);
4545
4546         dev_set_rx_mode(dev);
4547
4548         /*
4549          *      Have we downed the interface. We handle IFF_UP ourselves
4550          *      according to user attempts to set it, rather than blindly
4551          *      setting it.
4552          */
4553
4554         ret = 0;
4555         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4556                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4557
4558                 if (!ret)
4559                         dev_set_rx_mode(dev);
4560         }
4561
4562         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4563                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4564
4565                 dev->gflags ^= IFF_PROMISC;
4566                 dev_set_promiscuity(dev, inc);
4567         }
4568
4569         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4570            is important. Some (broken) drivers set IFF_PROMISC, when
4571            IFF_ALLMULTI is requested not asking us and not reporting.
4572          */
4573         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4574                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4575
4576                 dev->gflags ^= IFF_ALLMULTI;
4577                 dev_set_allmulti(dev, inc);
4578         }
4579
4580         return ret;
4581 }
4582
4583 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4584 {
4585         unsigned int changes = dev->flags ^ old_flags;
4586
4587         if (changes & IFF_UP) {
4588                 if (dev->flags & IFF_UP)
4589                         call_netdevice_notifiers(NETDEV_UP, dev);
4590                 else
4591                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4592         }
4593
4594         if (dev->flags & IFF_UP &&
4595             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4596                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4597 }
4598
4599 /**
4600  *      dev_change_flags - change device settings
4601  *      @dev: device
4602  *      @flags: device state flags
4603  *
4604  *      Change settings on device based state flags. The flags are
4605  *      in the userspace exported format.
4606  */
4607 int dev_change_flags(struct net_device *dev, unsigned flags)
4608 {
4609         int ret, changes;
4610         int old_flags = dev->flags;
4611
4612         ret = __dev_change_flags(dev, flags);
4613         if (ret < 0)
4614                 return ret;
4615
4616         changes = old_flags ^ dev->flags;
4617         if (changes)
4618                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4619
4620         __dev_notify_flags(dev, old_flags);
4621         return ret;
4622 }
4623 EXPORT_SYMBOL(dev_change_flags);
4624
4625 /**
4626  *      dev_set_mtu - Change maximum transfer unit
4627  *      @dev: device
4628  *      @new_mtu: new transfer unit
4629  *
4630  *      Change the maximum transfer size of the network device.
4631  */
4632 int dev_set_mtu(struct net_device *dev, int new_mtu)
4633 {
4634         const struct net_device_ops *ops = dev->netdev_ops;
4635         int err;
4636
4637         if (new_mtu == dev->mtu)
4638                 return 0;
4639
4640         /*      MTU must be positive.    */
4641         if (new_mtu < 0)
4642                 return -EINVAL;
4643
4644         if (!netif_device_present(dev))
4645                 return -ENODEV;
4646
4647         err = 0;
4648         if (ops->ndo_change_mtu)
4649                 err = ops->ndo_change_mtu(dev, new_mtu);
4650         else
4651                 dev->mtu = new_mtu;
4652
4653         if (!err && dev->flags & IFF_UP)
4654                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4655         return err;
4656 }
4657 EXPORT_SYMBOL(dev_set_mtu);
4658
4659 /**
4660  *      dev_set_group - Change group this device belongs to
4661  *      @dev: device
4662  *      @new_group: group this device should belong to
4663  */
4664 void dev_set_group(struct net_device *dev, int new_group)
4665 {
4666         dev->group = new_group;
4667 }
4668 EXPORT_SYMBOL(dev_set_group);
4669
4670 /**
4671  *      dev_set_mac_address - Change Media Access Control Address
4672  *      @dev: device
4673  *      @sa: new address
4674  *
4675  *      Change the hardware (MAC) address of the device
4676  */
4677 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4678 {
4679         const struct net_device_ops *ops = dev->netdev_ops;
4680         int err;
4681
4682         if (!ops->ndo_set_mac_address)
4683                 return -EOPNOTSUPP;
4684         if (sa->sa_family != dev->type)
4685                 return -EINVAL;
4686         if (!netif_device_present(dev))
4687                 return -ENODEV;
4688         err = ops->ndo_set_mac_address(dev, sa);
4689         if (!err)
4690                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4691         return err;
4692 }
4693 EXPORT_SYMBOL(dev_set_mac_address);
4694
4695 /*
4696  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4697  */
4698 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4699 {
4700         int err;
4701         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4702
4703         if (!dev)
4704                 return -ENODEV;
4705
4706         switch (cmd) {
4707         case SIOCGIFFLAGS:      /* Get interface flags */
4708                 ifr->ifr_flags = (short) dev_get_flags(dev);
4709                 return 0;
4710
4711         case SIOCGIFMETRIC:     /* Get the metric on the interface
4712                                    (currently unused) */
4713                 ifr->ifr_metric = 0;
4714                 return 0;
4715
4716         case SIOCGIFMTU:        /* Get the MTU of a device */
4717                 ifr->ifr_mtu = dev->mtu;
4718                 return 0;
4719
4720         case SIOCGIFHWADDR:
4721                 if (!dev->addr_len)
4722                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4723                 else
4724                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4725                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4726                 ifr->ifr_hwaddr.sa_family = dev->type;
4727                 return 0;
4728
4729         case SIOCGIFSLAVE:
4730                 err = -EINVAL;
4731                 break;
4732
4733         case SIOCGIFMAP:
4734                 ifr->ifr_map.mem_start = dev->mem_start;
4735                 ifr->ifr_map.mem_end   = dev->mem_end;
4736                 ifr->ifr_map.base_addr = dev->base_addr;
4737                 ifr->ifr_map.irq       = dev->irq;
4738                 ifr->ifr_map.dma       = dev->dma;
4739                 ifr->ifr_map.port      = dev->if_port;
4740                 return 0;
4741
4742         case SIOCGIFINDEX:
4743                 ifr->ifr_ifindex = dev->ifindex;
4744                 return 0;
4745
4746         case SIOCGIFTXQLEN:
4747                 ifr->ifr_qlen = dev->tx_queue_len;
4748                 return 0;
4749
4750         default:
4751                 /* dev_ioctl() should ensure this case
4752                  * is never reached
4753                  */
4754                 WARN_ON(1);
4755                 err = -EINVAL;
4756                 break;
4757
4758         }
4759         return err;
4760 }
4761
4762 /*
4763  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4764  */
4765 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4766 {
4767         int err;
4768         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4769         const struct net_device_ops *ops;
4770
4771         if (!dev)
4772                 return -ENODEV;
4773
4774         ops = dev->netdev_ops;
4775
4776         switch (cmd) {
4777         case SIOCSIFFLAGS:      /* Set interface flags */
4778                 return dev_change_flags(dev, ifr->ifr_flags);
4779
4780         case SIOCSIFMETRIC:     /* Set the metric on the interface
4781                                    (currently unused) */
4782                 return -EOPNOTSUPP;
4783
4784         case SIOCSIFMTU:        /* Set the MTU of a device */
4785                 return dev_set_mtu(dev, ifr->ifr_mtu);
4786
4787         case SIOCSIFHWADDR:
4788                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4789
4790         case SIOCSIFHWBROADCAST:
4791                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4792                         return -EINVAL;
4793                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4794                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4795                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4796                 return 0;
4797
4798         case SIOCSIFMAP:
4799                 if (ops->ndo_set_config) {
4800                         if (!netif_device_present(dev))
4801                                 return -ENODEV;
4802                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4803                 }
4804                 return -EOPNOTSUPP;
4805
4806         case SIOCADDMULTI:
4807                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4808                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4809                         return -EINVAL;
4810                 if (!netif_device_present(dev))
4811                         return -ENODEV;
4812                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4813
4814         case SIOCDELMULTI:
4815                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4816                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4817                         return -EINVAL;
4818                 if (!netif_device_present(dev))
4819                         return -ENODEV;
4820                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4821
4822         case SIOCSIFTXQLEN:
4823                 if (ifr->ifr_qlen < 0)
4824                         return -EINVAL;
4825                 dev->tx_queue_len = ifr->ifr_qlen;
4826                 return 0;
4827
4828         case SIOCSIFNAME:
4829                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4830                 return dev_change_name(dev, ifr->ifr_newname);
4831
4832         /*
4833          *      Unknown or private ioctl
4834          */
4835         default:
4836                 if ((cmd >= SIOCDEVPRIVATE &&
4837                     cmd <= SIOCDEVPRIVATE + 15) ||
4838                     cmd == SIOCBONDENSLAVE ||
4839                     cmd == SIOCBONDRELEASE ||
4840                     cmd == SIOCBONDSETHWADDR ||
4841                     cmd == SIOCBONDSLAVEINFOQUERY ||
4842                     cmd == SIOCBONDINFOQUERY ||
4843                     cmd == SIOCBONDCHANGEACTIVE ||
4844                     cmd == SIOCGMIIPHY ||
4845                     cmd == SIOCGMIIREG ||
4846                     cmd == SIOCSMIIREG ||
4847                     cmd == SIOCBRADDIF ||
4848                     cmd == SIOCBRDELIF ||
4849                     cmd == SIOCSHWTSTAMP ||
4850                     cmd == SIOCWANDEV) {
4851                         err = -EOPNOTSUPP;
4852                         if (ops->ndo_do_ioctl) {
4853                                 if (netif_device_present(dev))
4854                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4855                                 else
4856                                         err = -ENODEV;
4857                         }
4858                 } else
4859                         err = -EINVAL;
4860
4861         }
4862         return err;
4863 }
4864
4865 /*
4866  *      This function handles all "interface"-type I/O control requests. The actual
4867  *      'doing' part of this is dev_ifsioc above.
4868  */
4869
4870 /**
4871  *      dev_ioctl       -       network device ioctl
4872  *      @net: the applicable net namespace
4873  *      @cmd: command to issue
4874  *      @arg: pointer to a struct ifreq in user space
4875  *
4876  *      Issue ioctl functions to devices. This is normally called by the
4877  *      user space syscall interfaces but can sometimes be useful for
4878  *      other purposes. The return value is the return from the syscall if
4879  *      positive or a negative errno code on error.
4880  */
4881
4882 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4883 {
4884         struct ifreq ifr;
4885         int ret;
4886         char *colon;
4887
4888         /* One special case: SIOCGIFCONF takes ifconf argument
4889            and requires shared lock, because it sleeps writing
4890            to user space.
4891          */
4892
4893         if (cmd == SIOCGIFCONF) {
4894                 rtnl_lock();
4895                 ret = dev_ifconf(net, (char __user *) arg);
4896                 rtnl_unlock();
4897                 return ret;
4898         }
4899         if (cmd == SIOCGIFNAME)
4900                 return dev_ifname(net, (struct ifreq __user *)arg);
4901
4902         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4903                 return -EFAULT;
4904
4905         ifr.ifr_name[IFNAMSIZ-1] = 0;
4906
4907         colon = strchr(ifr.ifr_name, ':');
4908         if (colon)
4909                 *colon = 0;
4910
4911         /*
4912          *      See which interface the caller is talking about.
4913          */
4914
4915         switch (cmd) {
4916         /*
4917          *      These ioctl calls:
4918          *      - can be done by all.
4919          *      - atomic and do not require locking.
4920          *      - return a value
4921          */
4922         case SIOCGIFFLAGS:
4923         case SIOCGIFMETRIC:
4924         case SIOCGIFMTU:
4925         case SIOCGIFHWADDR:
4926         case SIOCGIFSLAVE:
4927         case SIOCGIFMAP:
4928         case SIOCGIFINDEX:
4929         case SIOCGIFTXQLEN:
4930                 dev_load(net, ifr.ifr_name);
4931                 rcu_read_lock();
4932                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4933                 rcu_read_unlock();
4934                 if (!ret) {
4935                         if (colon)
4936                                 *colon = ':';
4937                         if (copy_to_user(arg, &ifr,
4938                                          sizeof(struct ifreq)))
4939                                 ret = -EFAULT;
4940                 }
4941                 return ret;
4942
4943         case SIOCETHTOOL:
4944                 dev_load(net, ifr.ifr_name);
4945                 rtnl_lock();
4946                 ret = dev_ethtool(net, &ifr);
4947                 rtnl_unlock();
4948                 if (!ret) {
4949                         if (colon)
4950                                 *colon = ':';
4951                         if (copy_to_user(arg, &ifr,
4952                                          sizeof(struct ifreq)))
4953                                 ret = -EFAULT;
4954                 }
4955                 return ret;
4956
4957         /*
4958          *      These ioctl calls:
4959          *      - require superuser power.
4960          *      - require strict serialization.
4961          *      - return a value
4962          */
4963         case SIOCGMIIPHY:
4964         case SIOCGMIIREG:
4965         case SIOCSIFNAME:
4966                 if (!capable(CAP_NET_ADMIN))
4967                         return -EPERM;
4968                 dev_load(net, ifr.ifr_name);
4969                 rtnl_lock();
4970                 ret = dev_ifsioc(net, &ifr, cmd);
4971                 rtnl_unlock();
4972                 if (!ret) {
4973                         if (colon)
4974                                 *colon = ':';
4975                         if (copy_to_user(arg, &ifr,
4976                                          sizeof(struct ifreq)))
4977                                 ret = -EFAULT;
4978                 }
4979                 return ret;
4980
4981         /*
4982          *      These ioctl calls:
4983          *      - require superuser power.
4984          *      - require strict serialization.
4985          *      - do not return a value
4986          */
4987         case SIOCSIFFLAGS:
4988         case SIOCSIFMETRIC:
4989         case SIOCSIFMTU:
4990         case SIOCSIFMAP:
4991         case SIOCSIFHWADDR:
4992         case SIOCSIFSLAVE:
4993         case SIOCADDMULTI:
4994         case SIOCDELMULTI:
4995         case SIOCSIFHWBROADCAST:
4996         case SIOCSIFTXQLEN:
4997         case SIOCSMIIREG:
4998         case SIOCBONDENSLAVE:
4999         case SIOCBONDRELEASE:
5000         case SIOCBONDSETHWADDR:
5001         case SIOCBONDCHANGEACTIVE:
5002         case SIOCBRADDIF:
5003         case SIOCBRDELIF:
5004         case SIOCSHWTSTAMP:
5005                 if (!capable(CAP_NET_ADMIN))
5006                         return -EPERM;
5007                 /* fall through */
5008         case SIOCBONDSLAVEINFOQUERY:
5009         case SIOCBONDINFOQUERY:
5010                 dev_load(net, ifr.ifr_name);
5011                 rtnl_lock();
5012                 ret = dev_ifsioc(net, &ifr, cmd);
5013                 rtnl_unlock();
5014                 return ret;
5015
5016         case SIOCGIFMEM:
5017                 /* Get the per device memory space. We can add this but
5018                  * currently do not support it */
5019         case SIOCSIFMEM:
5020                 /* Set the per device memory buffer space.
5021                  * Not applicable in our case */
5022         case SIOCSIFLINK:
5023                 return -EINVAL;
5024
5025         /*
5026          *      Unknown or private ioctl.
5027          */
5028         default:
5029                 if (cmd == SIOCWANDEV ||
5030                     (cmd >= SIOCDEVPRIVATE &&
5031                      cmd <= SIOCDEVPRIVATE + 15)) {
5032                         dev_load(net, ifr.ifr_name);
5033                         rtnl_lock();
5034                         ret = dev_ifsioc(net, &ifr, cmd);
5035                         rtnl_unlock();
5036                         if (!ret && copy_to_user(arg, &ifr,
5037                                                  sizeof(struct ifreq)))
5038                                 ret = -EFAULT;
5039                         return ret;
5040                 }
5041                 /* Take care of Wireless Extensions */
5042                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5043                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5044                 return -EINVAL;
5045         }
5046 }
5047
5048
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number. Candidates come from a static counter that is advanced on
 *	every attempt and reset to 1 whenever it would become zero or
 *	negative; the first candidate not found by __dev_get_by_index() in
 *	@net is returned. The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		if (++ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5067
5068 /* Delayed registration/unregisteration */
5069 static LIST_HEAD(net_todo_list);
5070
5071 static void net_set_todo(struct net_device *dev)
5072 {
5073         list_add_tail(&dev->todo_list, &net_todo_list);
5074 }
5075
5076 static void rollback_registered_many(struct list_head *head)
5077 {
5078         struct net_device *dev, *tmp;
5079
5080         BUG_ON(dev_boot_phase);
5081         ASSERT_RTNL();
5082
5083         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5084                 /* Some devices call without registering
5085                  * for initialization unwind. Remove those
5086                  * devices and proceed with the remaining.
5087                  */
5088                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5089                         pr_debug("unregister_netdevice: device %s/%p never "
5090                                  "was registered\n", dev->name, dev);
5091
5092                         WARN_ON(1);
5093                         list_del(&dev->unreg_list);
5094                         continue;
5095                 }
5096
5097                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5098         }
5099
5100         /* If device is running, close it first. */
5101         dev_close_many(head);
5102
5103         list_for_each_entry(dev, head, unreg_list) {
5104                 /* And unlink it from device chain. */
5105                 unlist_netdevice(dev);
5106
5107                 dev->reg_state = NETREG_UNREGISTERING;
5108         }
5109
5110         synchronize_net();
5111
5112         list_for_each_entry(dev, head, unreg_list) {
5113                 /* Shutdown queueing discipline. */
5114                 dev_shutdown(dev);
5115
5116
5117                 /* Notify protocols, that we are about to destroy
5118                    this device. They should clean all the things.
5119                 */
5120                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5121
5122                 if (!dev->rtnl_link_ops ||
5123                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5124                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5125
5126                 /*
5127                  *      Flush the unicast and multicast chains
5128                  */
5129                 dev_uc_flush(dev);
5130                 dev_mc_flush(dev);
5131
5132                 if (dev->netdev_ops->ndo_uninit)
5133                         dev->netdev_ops->ndo_uninit(dev);
5134
5135                 /* Notifier chain MUST detach us from master device. */
5136                 WARN_ON(dev->master);
5137
5138                 /* Remove entries from kobject tree */
5139                 netdev_unregister_kobject(dev);
5140         }
5141
5142         /* Process any work delayed until the end of the batch */
5143         dev = list_first_entry(head, struct net_device, unreg_list);
5144         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5145
5146         rcu_barrier();
5147
5148         list_for_each_entry(dev, head, unreg_list)
5149                 dev_put(dev);
5150 }
5151
5152 static void rollback_registered(struct net_device *dev)
5153 {
5154         LIST_HEAD(single);
5155
5156         list_add(&dev->unreg_list, &single);
5157         rollback_registered_many(&single);
5158         list_del(&single);
5159 }
5160
5161 u32 netdev_fix_features(struct net_device *dev, u32 features)
5162 {
5163         /* Fix illegal checksum combinations */
5164         if ((features & NETIF_F_HW_CSUM) &&
5165             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5166                 netdev_info(dev, "mixed HW and IP checksum settings.\n");
5167                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5168         }
5169
5170         if ((features & NETIF_F_NO_CSUM) &&
5171             (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5172                 netdev_info(dev, "mixed no checksumming and other settings.\n");
5173                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5174         }
5175
5176         /* Fix illegal SG+CSUM combinations. */
5177         if ((features & NETIF_F_SG) &&
5178             !(features & NETIF_F_ALL_CSUM)) {
5179                 netdev_info(dev,
5180                             "Dropping NETIF_F_SG since no checksum feature.\n");
5181                 features &= ~NETIF_F_SG;
5182         }
5183
5184         /* TSO requires that SG is present as well. */
5185         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5186                 netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n");
5187                 features &= ~NETIF_F_TSO;
5188         }
5189
5190         /* Software GSO depends on SG. */
5191         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5192                 netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5193                 features &= ~NETIF_F_GSO;
5194         }
5195
5196         /* UFO needs SG and checksumming */
5197         if (features & NETIF_F_UFO) {
5198                 /* maybe split UFO into V4 and V6? */
5199                 if (!((features & NETIF_F_GEN_CSUM) ||
5200                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5201                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5202                         netdev_info(dev,
5203                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5204                         features &= ~NETIF_F_UFO;
5205                 }
5206
5207                 if (!(features & NETIF_F_SG)) {
5208                         netdev_info(dev,
5209                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5210                         features &= ~NETIF_F_UFO;
5211                 }
5212         }
5213
5214         return features;
5215 }
5216 EXPORT_SYMBOL(netdev_fix_features);
5217
5218 void netdev_update_features(struct net_device *dev)
5219 {
5220         u32 features;
5221         int err = 0;
5222
5223         features = netdev_get_wanted_features(dev);
5224
5225         if (dev->netdev_ops->ndo_fix_features)
5226                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5227
5228         /* driver might be less strict about feature dependencies */
5229         features = netdev_fix_features(dev, features);
5230
5231         if (dev->features == features)
5232                 return;
5233
5234         netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n",
5235                 dev->features, features);
5236
5237         if (dev->netdev_ops->ndo_set_features)
5238                 err = dev->netdev_ops->ndo_set_features(dev, features);
5239
5240         if (!err)
5241                 dev->features = features;
5242         else if (err < 0)
5243                 netdev_err(dev,
5244                         "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5245                         err, features, dev->features);
5246 }
5247 EXPORT_SYMBOL(netdev_update_features);
5248
5249 /**
5250  *      netif_stacked_transfer_operstate -      transfer operstate
5251  *      @rootdev: the root or lower level device to transfer state from
5252  *      @dev: the device to transfer operstate to
5253  *
5254  *      Transfer operational state from root to device. This is normally
5255  *      called when a stacking relationship exists between the root
5256  *      device and the device(a leaf device).
5257  */
5258 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5259                                         struct net_device *dev)
5260 {
5261         if (rootdev->operstate == IF_OPER_DORMANT)
5262                 netif_dormant_on(dev);
5263         else
5264                 netif_dormant_off(dev);
5265
5266         if (netif_carrier_ok(rootdev)) {
5267                 if (!netif_carrier_ok(dev))
5268                         netif_carrier_on(dev);
5269         } else {
5270                 if (netif_carrier_ok(dev))
5271                         netif_carrier_off(dev);
5272         }
5273 }
5274 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5275
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * point every entry back at @dev and store the array in dev->_rx.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int nqueues = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        unsigned int idx;

        BUG_ON(nqueues < 1);

        queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
        if (!queues) {
                pr_err("netdev: Unable to allocate %u rx queues.\n", nqueues);
                return -ENOMEM;
        }

        for (idx = 0; idx < nqueues; idx++)
                queues[idx].dev = dev;

        dev->_rx = queues;
        return 0;
}
#endif
5296
5297 static void netdev_init_one_queue(struct net_device *dev,
5298                                   struct netdev_queue *queue, void *_unused)
5299 {
5300         /* Initialize queue lock */
5301         spin_lock_init(&queue->_xmit_lock);
5302         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5303         queue->xmit_lock_owner = -1;
5304         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5305         queue->dev = dev;
5306 }
5307
5308 static int netif_alloc_netdev_queues(struct net_device *dev)
5309 {
5310         unsigned int count = dev->num_tx_queues;
5311         struct netdev_queue *tx;
5312
5313         BUG_ON(count < 1);
5314
5315         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5316         if (!tx) {
5317                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5318                        count);
5319                 return -ENOMEM;
5320         }
5321         dev->_tx = tx;
5322
5323         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5324         spin_lock_init(&dev->tx_global_lock);
5325
5326         return 0;
5327 }
5328
5329 /**
5330  *      register_netdevice      - register a network device
5331  *      @dev: device to register
5332  *
5333  *      Take a completed network device structure and add it to the kernel
5334  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5335  *      chain. 0 is returned on success. A negative errno code is returned
5336  *      on a failure to set up the device, or if the name is a duplicate.
5337  *
5338  *      Callers must hold the rtnl semaphore. You may want
5339  *      register_netdev() instead of this.
5340  *
5341  *      BUGS:
5342  *      The locking appears insufficient to guarantee two parallel registers
5343  *      will not get the same name.
5344  */
5345
5346 int register_netdevice(struct net_device *dev)
5347 {
5348         int ret;
5349         struct net *net = dev_net(dev);
5350
5351         BUG_ON(dev_boot_phase);
5352         ASSERT_RTNL();
5353
5354         might_sleep();
5355
5356         /* When net_device's are persistent, this will be fatal. */
5357         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5358         BUG_ON(!net);
5359
5360         spin_lock_init(&dev->addr_list_lock);
5361         netdev_set_addr_lockdep_class(dev);
5362
5363         dev->iflink = -1;
5364
5365         /* Init, if this function is available */
5366         if (dev->netdev_ops->ndo_init) {
5367                 ret = dev->netdev_ops->ndo_init(dev);
5368                 if (ret) {
5369                         if (ret > 0)
5370                                 ret = -EIO;
5371                         goto out;
5372                 }
5373         }
5374
5375         ret = dev_get_valid_name(dev, dev->name, 0);
5376         if (ret)
5377                 goto err_uninit;
5378
5379         dev->ifindex = dev_new_index(net);
5380         if (dev->iflink == -1)
5381                 dev->iflink = dev->ifindex;
5382
5383         /* Transfer changeable features to wanted_features and enable
5384          * software offloads (GSO and GRO).
5385          */
5386         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5387         dev->features |= NETIF_F_SOFT_FEATURES;
5388         dev->wanted_features = dev->features & dev->hw_features;
5389
5390         /* Avoid warning from netdev_fix_features() for GSO without SG */
5391         if (!(dev->wanted_features & NETIF_F_SG)) {
5392                 dev->wanted_features &= ~NETIF_F_GSO;
5393                 dev->features &= ~NETIF_F_GSO;
5394         }
5395
5396         /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5397          * vlan_dev_init() will do the dev->features check, so these features
5398          * are enabled only if supported by underlying device.
5399          */
5400         dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5401
5402         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5403         ret = notifier_to_errno(ret);
5404         if (ret)
5405                 goto err_uninit;
5406
5407         ret = netdev_register_kobject(dev);
5408         if (ret)
5409                 goto err_uninit;
5410         dev->reg_state = NETREG_REGISTERED;
5411
5412         netdev_update_features(dev);
5413
5414         /*
5415          *      Default initial state at registry is that the
5416          *      device is present.
5417          */
5418
5419         set_bit(__LINK_STATE_PRESENT, &dev->state);
5420
5421         dev_init_scheduler(dev);
5422         dev_hold(dev);
5423         list_netdevice(dev);
5424
5425         /* Notify protocols, that a new device appeared. */
5426         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5427         ret = notifier_to_errno(ret);
5428         if (ret) {
5429                 rollback_registered(dev);
5430                 dev->reg_state = NETREG_UNREGISTERED;
5431         }
5432         /*
5433          *      Prevent userspace races by waiting until the network
5434          *      device is fully setup before sending notifications.
5435          */
5436         if (!dev->rtnl_link_ops ||
5437             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5438                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5439
5440 out:
5441         return ret;
5442
5443 err_uninit:
5444         if (dev->netdev_ops->ndo_uninit)
5445                 dev->netdev_ops->ndo_uninit(dev);
5446         goto out;
5447 }
5448 EXPORT_SYMBOL(register_netdevice);
5449
5450 /**
5451  *      init_dummy_netdev       - init a dummy network device for NAPI
5452  *      @dev: device to init
5453  *
5454  *      This takes a network device structure and initialize the minimum
5455  *      amount of fields so it can be used to schedule NAPI polls without
5456  *      registering a full blown interface. This is to be used by drivers
5457  *      that need to tie several hardware interfaces to a single NAPI
5458  *      poll scheduler due to HW limitations.
5459  */
5460 int init_dummy_netdev(struct net_device *dev)
5461 {
5462         /* Clear everything. Note we don't initialize spinlocks
5463          * are they aren't supposed to be taken by any of the
5464          * NAPI code and this dummy netdev is supposed to be
5465          * only ever used for NAPI polls
5466          */
5467         memset(dev, 0, sizeof(struct net_device));
5468
5469         /* make sure we BUG if trying to hit standard
5470          * register/unregister code path
5471          */
5472         dev->reg_state = NETREG_DUMMY;
5473
5474         /* NAPI wants this */
5475         INIT_LIST_HEAD(&dev->napi_list);
5476
5477         /* a dummy interface is started by default */
5478         set_bit(__LINK_STATE_PRESENT, &dev->state);
5479         set_bit(__LINK_STATE_START, &dev->state);
5480
5481         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5482          * because users of this 'device' dont need to change
5483          * its refcount.
5484          */
5485
5486         return 0;
5487 }
5488 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5489
5490
5491 /**
5492  *      register_netdev - register a network device
5493  *      @dev: device to register
5494  *
5495  *      Take a completed network device structure and add it to the kernel
5496  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5497  *      chain. 0 is returned on success. A negative errno code is returned
5498  *      on a failure to set up the device, or if the name is a duplicate.
5499  *
5500  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5501  *      and expands the device name if you passed a format string to
5502  *      alloc_netdev.
5503  */
5504 int register_netdev(struct net_device *dev)
5505 {
5506         int err;
5507
5508         rtnl_lock();
5509
5510         /*
5511          * If the name is a format string the caller wants us to do a
5512          * name allocation.
5513          */
5514         if (strchr(dev->name, '%')) {
5515                 err = dev_alloc_name(dev, dev->name);
5516                 if (err < 0)
5517                         goto out;
5518         }
5519
5520         err = register_netdevice(dev);
5521 out:
5522         rtnl_unlock();
5523         return err;
5524 }
5525 EXPORT_SYMBOL(register_netdev);
5526
5527 int netdev_refcnt_read(const struct net_device *dev)
5528 {
5529         int i, refcnt = 0;
5530
5531         for_each_possible_cpu(i)
5532                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5533         return refcnt;
5534 }
5535 EXPORT_SYMBOL(netdev_refcnt_read);
5536
5537 /*
5538  * netdev_wait_allrefs - wait until all references are gone.
5539  *
5540  * This is called when unregistering network devices.
5541  *
5542  * Any protocol or device that holds a reference should register
5543  * for netdevice notification, and cleanup and put back the
5544  * reference if they receive an UNREGISTER event.
5545  * We can get stuck here if buggy protocols don't correctly
5546  * call dev_put.
5547  */
5548 static void netdev_wait_allrefs(struct net_device *dev)
5549 {
5550         unsigned long rebroadcast_time, warning_time;
5551         int refcnt;
5552
5553         linkwatch_forget_dev(dev);
5554
5555         rebroadcast_time = warning_time = jiffies;
5556         refcnt = netdev_refcnt_read(dev);
5557
5558         while (refcnt != 0) {
5559                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5560                         rtnl_lock();
5561
5562                         /* Rebroadcast unregister notification */
5563                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5564                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5565                          * should have already handle it the first time */
5566
5567                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5568                                      &dev->state)) {
5569                                 /* We must not have linkwatch events
5570                                  * pending on unregister. If this
5571                                  * happens, we simply run the queue
5572                                  * unscheduled, resulting in a noop
5573                                  * for this device.
5574                                  */
5575                                 linkwatch_run_queue();
5576                         }
5577
5578                         __rtnl_unlock();
5579
5580                         rebroadcast_time = jiffies;
5581                 }
5582
5583                 msleep(250);
5584
5585                 refcnt = netdev_refcnt_read(dev);
5586
5587                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5588                         printk(KERN_EMERG "unregister_netdevice: "
5589                                "waiting for %s to become free. Usage "
5590                                "count = %d\n",
5591                                dev->name, refcnt);
5592                         warning_time = jiffies;
5593                 }
5594         }
5595 }
5596
5597 /* The sequence is:
5598  *
5599  *      rtnl_lock();
5600  *      ...
5601  *      register_netdevice(x1);
5602  *      register_netdevice(x2);
5603  *      ...
5604  *      unregister_netdevice(y1);
5605  *      unregister_netdevice(y2);
5606  *      ...
5607  *      rtnl_unlock();
5608  *      free_netdev(y1);
5609  *      free_netdev(y2);
5610  *
5611  * We are invoked by rtnl_unlock().
5612  * This allows us to deal with problems:
5613  * 1) We can delete sysfs objects which invoke hotplug
5614  *    without deadlocking with linkwatch via keventd.
5615  * 2) Since we run with the RTNL semaphore not held, we can sleep
5616  *    safely in order to wait for the netdev refcnt to drop to zero.
5617  *
5618  * We must not return until all unregister events added during
5619  * the interval the lock was held have been completed.
5620  */
5621 void netdev_run_todo(void)
5622 {
5623         struct list_head list;
5624
5625         /* Snapshot list, allow later requests */
5626         list_replace_init(&net_todo_list, &list);
5627
5628         __rtnl_unlock();
5629
5630         while (!list_empty(&list)) {
5631                 struct net_device *dev
5632                         = list_first_entry(&list, struct net_device, todo_list);
5633                 list_del(&dev->todo_list);
5634
5635                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5636                         printk(KERN_ERR "network todo '%s' but state %d\n",
5637                                dev->name, dev->reg_state);
5638                         dump_stack();
5639                         continue;
5640                 }
5641
5642                 dev->reg_state = NETREG_UNREGISTERED;
5643
5644                 on_each_cpu(flush_backlog, dev, 1);
5645
5646                 netdev_wait_allrefs(dev);
5647
5648                 /* paranoia */
5649                 BUG_ON(netdev_refcnt_read(dev));
5650                 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5651                 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5652                 WARN_ON(dev->dn_ptr);
5653
5654                 if (dev->destructor)
5655                         dev->destructor(dev);
5656
5657                 /* Free network device */
5658                 kobject_put(&dev->dev.kobj);
5659         }
5660 }
5661
5662 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5663  * fields in the same order, with only the type differing.
5664  */
5665 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5666                                     const struct net_device_stats *netdev_stats)
5667 {
5668 #if BITS_PER_LONG == 64
5669         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5670         memcpy(stats64, netdev_stats, sizeof(*stats64));
5671 #else
5672         size_t i, n = sizeof(*stats64) / sizeof(u64);
5673         const unsigned long *src = (const unsigned long *)netdev_stats;
5674         u64 *dst = (u64 *)stats64;
5675
5676         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5677                      sizeof(*stats64) / sizeof(u64));
5678         for (i = 0; i < n; i++)
5679                 dst[i] = src[i];
5680 #endif
5681 }
5682
5683 /**
5684  *      dev_get_stats   - get network device statistics
5685  *      @dev: device to get statistics from
5686  *      @storage: place to store stats
5687  *
5688  *      Get network statistics from device. Return @storage.
5689  *      The device driver may provide its own method by setting
5690  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5691  *      otherwise the internal statistics structure is used.
5692  */
5693 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5694                                         struct rtnl_link_stats64 *storage)
5695 {
5696         const struct net_device_ops *ops = dev->netdev_ops;
5697
5698         if (ops->ndo_get_stats64) {
5699                 memset(storage, 0, sizeof(*storage));
5700                 ops->ndo_get_stats64(dev, storage);
5701         } else if (ops->ndo_get_stats) {
5702                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5703         } else {
5704                 netdev_stats_to_stats64(storage, &dev->stats);
5705         }
5706         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5707         return storage;
5708 }
5709 EXPORT_SYMBOL(dev_get_stats);
5710
/*
 * Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails. Without
 * CONFIG_NET_CLS_ACT, whatever dev_ingress_queue() reports is returned.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!queue) {
                queue = kzalloc(sizeof(*queue), GFP_KERNEL);
                if (queue) {
                        netdev_init_one_queue(dev, queue, NULL);
                        queue->qdisc = &noop_qdisc;
                        queue->qdisc_sleeping = &noop_qdisc;
                        /* Publish only after the queue is fully set up. */
                        rcu_assign_pointer(dev->ingress_queue, queue);
                }
        }
#endif
        return queue;
}
5728
5729 /**
5730  *      alloc_netdev_mqs - allocate network device
5731  *      @sizeof_priv:   size of private data to allocate space for
5732  *      @name:          device name format string
5733  *      @setup:         callback to initialize device
5734  *      @txqs:          the number of TX subqueues to allocate
5735  *      @rxqs:          the number of RX subqueues to allocate
5736  *
5737  *      Allocates a struct net_device with private data area for driver use
5738  *      and performs basic initialization.  Also allocates subquue structs
5739  *      for each queue on the device.
5740  */
5741 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5742                 void (*setup)(struct net_device *),
5743                 unsigned int txqs, unsigned int rxqs)
5744 {
5745         struct net_device *dev;
5746         size_t alloc_size;
5747         struct net_device *p;
5748
5749         BUG_ON(strlen(name) >= sizeof(dev->name));
5750
5751         if (txqs < 1) {
5752                 pr_err("alloc_netdev: Unable to allocate device "
5753                        "with zero queues.\n");
5754                 return NULL;
5755         }
5756
5757 #ifdef CONFIG_RPS
5758         if (rxqs < 1) {
5759                 pr_err("alloc_netdev: Unable to allocate device "
5760                        "with zero RX queues.\n");
5761                 return NULL;
5762         }
5763 #endif
5764
5765         alloc_size = sizeof(struct net_device);
5766         if (sizeof_priv) {
5767                 /* ensure 32-byte alignment of private area */
5768                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5769                 alloc_size += sizeof_priv;
5770         }
5771         /* ensure 32-byte alignment of whole construct */
5772         alloc_size += NETDEV_ALIGN - 1;
5773
5774         p = kzalloc(alloc_size, GFP_KERNEL);
5775         if (!p) {
5776                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5777                 return NULL;
5778         }
5779
5780         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5781         dev->padded = (char *)dev - (char *)p;
5782
5783         dev->pcpu_refcnt = alloc_percpu(int);
5784         if (!dev->pcpu_refcnt)
5785                 goto free_p;
5786
5787         if (dev_addr_init(dev))
5788                 goto free_pcpu;
5789
5790         dev_mc_init(dev);
5791         dev_uc_init(dev);
5792
5793         dev_net_set(dev, &init_net);
5794
5795         dev->gso_max_size = GSO_MAX_SIZE;
5796
5797         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5798         dev->ethtool_ntuple_list.count = 0;
5799         INIT_LIST_HEAD(&dev->napi_list);
5800         INIT_LIST_HEAD(&dev->unreg_list);
5801         INIT_LIST_HEAD(&dev->link_watch_list);
5802         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5803         setup(dev);
5804
5805         dev->num_tx_queues = txqs;
5806         dev->real_num_tx_queues = txqs;
5807         if (netif_alloc_netdev_queues(dev))
5808                 goto free_all;
5809
5810 #ifdef CONFIG_RPS
5811         dev->num_rx_queues = rxqs;
5812         dev->real_num_rx_queues = rxqs;
5813         if (netif_alloc_rx_queues(dev))
5814                 goto free_all;
5815 #endif
5816
5817         strcpy(dev->name, name);
5818         dev->group = INIT_NETDEV_GROUP;
5819         return dev;
5820
5821 free_all:
5822         free_netdev(dev);
5823         return NULL;
5824
5825 free_pcpu:
5826         free_percpu(dev->pcpu_refcnt);
5827         kfree(dev->_tx);
5828 #ifdef CONFIG_RPS
5829         kfree(dev->_rx);
5830 #endif
5831
5832 free_p:
5833         kfree(p);
5834         return NULL;
5835 }
5836 EXPORT_SYMBOL(alloc_netdev_mqs);
5837
5838 /**
5839  *      free_netdev - free network device
5840  *      @dev: device
5841  *
5842  *      This function does the last stage of destroying an allocated device
5843  *      interface. The reference to the device object is released.
5844  *      If this is the last reference then it will be freed.
5845  */
5846 void free_netdev(struct net_device *dev)
5847 {
5848         struct napi_struct *p, *n;
5849
5850         release_net(dev_net(dev));
5851
5852         kfree(dev->_tx);
5853 #ifdef CONFIG_RPS
5854         kfree(dev->_rx);
5855 #endif
5856
5857         kfree(rcu_dereference_raw(dev->ingress_queue));
5858
5859         /* Flush device addresses */
5860         dev_addr_flush(dev);
5861
5862         /* Clear ethtool n-tuple list */
5863         ethtool_ntuple_flush(dev);
5864
5865         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5866                 netif_napi_del(p);
5867
5868         free_percpu(dev->pcpu_refcnt);
5869         dev->pcpu_refcnt = NULL;
5870
5871         /*  Compatibility with error handling in drivers */
5872         if (dev->reg_state == NETREG_UNINITIALIZED) {
5873                 kfree((char *)dev - dev->padded);
5874                 return;
5875         }
5876
5877         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5878         dev->reg_state = NETREG_RELEASED;
5879
5880         /* will free via device release */
5881         put_device(&dev->dev);
5882 }
5883 EXPORT_SYMBOL(free_netdev);
5884
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Wait for packets currently being received to be done.
 *      Does not block later packets from starting.
 *
 *      Implemented as synchronize_rcu(); the caller must be allowed to
 *      sleep.
 */
void synchronize_net(void)
{
        might_sleep();          /* flag callers from atomic context */
        synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5897
5898 /**
5899  *      unregister_netdevice_queue - remove device from the kernel
5900  *      @dev: device
5901  *      @head: list
5902  *
5903  *      This function shuts down a device interface and removes it
5904  *      from the kernel tables.
5905  *      If head not NULL, device is queued to be unregistered later.
5906  *
5907  *      Callers must hold the rtnl semaphore.  You may want
5908  *      unregister_netdev() instead of this.
5909  */
5910
5911 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5912 {
5913         ASSERT_RTNL();
5914
5915         if (head) {
5916                 list_move_tail(&dev->unreg_list, head);
5917         } else {
5918                 rollback_registered(dev);
5919                 /* Finish processing unregister after unlock */
5920                 net_set_todo(dev);
5921         }
5922 }
5923 EXPORT_SYMBOL(unregister_netdevice_queue);
5924
5925 /**
5926  *      unregister_netdevice_many - unregister many devices
5927  *      @head: list of devices
5928  */
5929 void unregister_netdevice_many(struct list_head *head)
5930 {
5931         struct net_device *dev;
5932
5933         if (!list_empty(head)) {
5934                 rollback_registered_many(head);
5935                 list_for_each_entry(dev, head, unreg_list)
5936                         net_set_todo(dev);
5937         }
5938 }
5939 EXPORT_SYMBOL(unregister_netdevice_many);
5940
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      This function shuts down a device interface and removes it
 *      from the kernel tables.
 *
 *      It simply calls unregister_netdevice() with the rtnl semaphore
 *      held.  In general you want to use this and not
 *      unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();
        unregister_netdevice(dev);
        rtnl_unlock();          /* releases the rtnl semaphore again */
}
EXPORT_SYMBOL(unregister_netdev);
5959
5960 /**
5961  *      dev_change_net_namespace - move device to different nethost namespace
5962  *      @dev: device
5963  *      @net: network namespace
5964  *      @pat: If not NULL name pattern to try if the current device name
5965  *            is already taken in the destination network namespace.
5966  *
5967  *      This function shuts down a device interface and moves it
5968  *      to a new network namespace. On success 0 is returned, on
5969  *      a failure a netagive errno code is returned.
5970  *
5971  *      Callers must hold the rtnl semaphore.
5972  */
5973
5974 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5975 {
5976         int err;
5977
5978         ASSERT_RTNL();
5979
5980         /* Don't allow namespace local devices to be moved. */
5981         err = -EINVAL;
5982         if (dev->features & NETIF_F_NETNS_LOCAL)
5983                 goto out;
5984
5985         /* Ensure the device has been registrered */
5986         err = -EINVAL;
5987         if (dev->reg_state != NETREG_REGISTERED)
5988                 goto out;
5989
5990         /* Get out if there is nothing todo */
5991         err = 0;
5992         if (net_eq(dev_net(dev), net))
5993                 goto out;
5994
5995         /* Pick the destination device name, and ensure
5996          * we can use it in the destination network namespace.
5997          */
5998         err = -EEXIST;
5999         if (__dev_get_by_name(net, dev->name)) {
6000                 /* We get here if we can't use the current device name */
6001                 if (!pat)
6002                         goto out;
6003                 if (dev_get_valid_name(dev, pat, 1))
6004                         goto out;
6005         }
6006
6007         /*
6008          * And now a mini version of register_netdevice unregister_netdevice.
6009          */
6010
6011         /* If device is running close it first. */
6012         dev_close(dev);
6013
6014         /* And unlink it from device chain */
6015         err = -ENODEV;
6016         unlist_netdevice(dev);
6017
6018         synchronize_net();
6019
6020         /* Shutdown queueing discipline. */
6021         dev_shutdown(dev);
6022
6023         /* Notify protocols, that we are about to destroy
6024            this device. They should clean all the things.
6025
6026            Note that dev->reg_state stays at NETREG_REGISTERED.
6027            This is wanted because this way 8021q and macvlan know
6028            the device is just moving and can keep their slaves up.
6029         */
6030         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6031         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6032
6033         /*
6034          *      Flush the unicast and multicast chains
6035          */
6036         dev_uc_flush(dev);
6037         dev_mc_flush(dev);
6038
6039         /* Actually switch the network namespace */
6040         dev_net_set(dev, net);
6041
6042         /* If there is an ifindex conflict assign a new one */
6043         if (__dev_get_by_index(net, dev->ifindex)) {
6044                 int iflink = (dev->iflink == dev->ifindex);
6045                 dev->ifindex = dev_new_index(net);
6046                 if (iflink)
6047                         dev->iflink = dev->ifindex;
6048         }
6049
6050         /* Fixup kobjects */
6051         err = device_rename(&dev->dev, dev->name);
6052         WARN_ON(err);
6053
6054         /* Add the device back in the hashes */
6055         list_netdevice(dev);
6056
6057         /* Notify protocols, that a new device appeared. */
6058         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6059
6060         /*
6061          *      Prevent userspace races by waiting until the network
6062          *      device is fully setup before sending notifications.
6063          */
6064         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6065
6066         synchronize_net();
6067         err = 0;
6068 out:
6069         return err;
6070 }
6071 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6072
6073 static int dev_cpu_callback(struct notifier_block *nfb,
6074                             unsigned long action,
6075                             void *ocpu)
6076 {
6077         struct sk_buff **list_skb;
6078         struct sk_buff *skb;
6079         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6080         struct softnet_data *sd, *oldsd;
6081
6082         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6083                 return NOTIFY_OK;
6084
6085         local_irq_disable();
6086         cpu = smp_processor_id();
6087         sd = &per_cpu(softnet_data, cpu);
6088         oldsd = &per_cpu(softnet_data, oldcpu);
6089
6090         /* Find end of our completion_queue. */
6091         list_skb = &sd->completion_queue;
6092         while (*list_skb)
6093                 list_skb = &(*list_skb)->next;
6094         /* Append completion queue from offline CPU. */
6095         *list_skb = oldsd->completion_queue;
6096         oldsd->completion_queue = NULL;
6097
6098         /* Append output queue from offline CPU. */
6099         if (oldsd->output_queue) {
6100                 *sd->output_queue_tailp = oldsd->output_queue;
6101                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6102                 oldsd->output_queue = NULL;
6103                 oldsd->output_queue_tailp = &oldsd->output_queue;
6104         }
6105
6106         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6107         local_irq_enable();
6108
6109         /* Process offline CPU's input_pkt_queue */
6110         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6111                 netif_rx(skb);
6112                 input_queue_head_incr(oldsd);
6113         }
6114         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6115                 netif_rx(skb);
6116                 input_queue_head_incr(oldsd);
6117         }
6118
6119         return NOTIFY_OK;
6120 }
6121
6122
6123 /**
6124  *      netdev_increment_features - increment feature set by one
6125  *      @all: current feature set
6126  *      @one: new feature set
6127  *      @mask: mask feature set
6128  *
6129  *      Computes a new feature set after adding a device with feature set
6130  *      @one to the master device with current feature set @all.  Will not
6131  *      enable anything that is off in @mask. Returns the new feature set.
6132  */
6133 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6134 {
6135         /* If device needs checksumming, downgrade to it. */
6136         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6137                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6138         else if (mask & NETIF_F_ALL_CSUM) {
6139                 /* If one device supports v4/v6 checksumming, set for all. */
6140                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6141                     !(all & NETIF_F_GEN_CSUM)) {
6142                         all &= ~NETIF_F_ALL_CSUM;
6143                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6144                 }
6145
6146                 /* If one device supports hw checksumming, set for all. */
6147                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6148                         all &= ~NETIF_F_ALL_CSUM;
6149                         all |= NETIF_F_HW_CSUM;
6150                 }
6151         }
6152
6153         one |= NETIF_F_ALL_CSUM;
6154
6155         one |= all & NETIF_F_ONE_FOR_ALL;
6156         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6157         all |= one & mask & NETIF_F_ONE_FOR_ALL;
6158
6159         return all;
6160 }
6161 EXPORT_SYMBOL(netdev_increment_features);
6162
6163 static struct hlist_head *netdev_create_hash(void)
6164 {
6165         int i;
6166         struct hlist_head *hash;
6167
6168         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6169         if (hash != NULL)
6170                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6171                         INIT_HLIST_HEAD(&hash[i]);
6172
6173         return hash;
6174 }
6175
6176 /* Initialize per network namespace state */
6177 static int __net_init netdev_init(struct net *net)
6178 {
6179         INIT_LIST_HEAD(&net->dev_base_head);
6180
6181         net->dev_name_head = netdev_create_hash();
6182         if (net->dev_name_head == NULL)
6183                 goto err_name;
6184
6185         net->dev_index_head = netdev_create_hash();
6186         if (net->dev_index_head == NULL)
6187                 goto err_idx;
6188
6189         return 0;
6190
6191 err_idx:
6192         kfree(net->dev_name_head);
6193 err_name:
6194         return -ENOMEM;
6195 }
6196
6197 /**
6198  *      netdev_drivername - network driver for the device
6199  *      @dev: network device
6200  *      @buffer: buffer for resulting name
6201  *      @len: size of buffer
6202  *
6203  *      Determine network driver for device.
6204  */
6205 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6206 {
6207         const struct device_driver *driver;
6208         const struct device *parent;
6209
6210         if (len <= 0 || !buffer)
6211                 return buffer;
6212         buffer[0] = 0;
6213
6214         parent = dev->dev.parent;
6215
6216         if (!parent)
6217                 return buffer;
6218
6219         driver = parent->driver;
6220         if (driver && driver->name)
6221                 strlcpy(buffer, driver->name, len);
6222         return buffer;
6223 }
6224
6225 static int __netdev_printk(const char *level, const struct net_device *dev,
6226                            struct va_format *vaf)
6227 {
6228         int r;
6229
6230         if (dev && dev->dev.parent)
6231                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6232                                netdev_name(dev), vaf);
6233         else if (dev)
6234                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6235         else
6236                 r = printk("%s(NULL net_device): %pV", level, vaf);
6237
6238         return r;
6239 }
6240
6241 int netdev_printk(const char *level, const struct net_device *dev,
6242                   const char *format, ...)
6243 {
6244         struct va_format vaf;
6245         va_list args;
6246         int r;
6247
6248         va_start(args, format);
6249
6250         vaf.fmt = format;
6251         vaf.va = &args;
6252
6253         r = __netdev_printk(level, dev, &vaf);
6254         va_end(args);
6255
6256         return r;
6257 }
6258 EXPORT_SYMBOL(netdev_printk);
6259
6260 #define define_netdev_printk_level(func, level)                 \
6261 int func(const struct net_device *dev, const char *fmt, ...)    \
6262 {                                                               \
6263         int r;                                                  \
6264         struct va_format vaf;                                   \
6265         va_list args;                                           \
6266                                                                 \
6267         va_start(args, fmt);                                    \
6268                                                                 \
6269         vaf.fmt = fmt;                                          \
6270         vaf.va = &args;                                         \
6271                                                                 \
6272         r = __netdev_printk(level, dev, &vaf);                  \
6273         va_end(args);                                           \
6274                                                                 \
6275         return r;                                               \
6276 }                                                               \
6277 EXPORT_SYMBOL(func);
6278
6279 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6280 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6281 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6282 define_netdev_printk_level(netdev_err, KERN_ERR);
6283 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6284 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6285 define_netdev_printk_level(netdev_info, KERN_INFO);
6286
6287 static void __net_exit netdev_exit(struct net *net)
6288 {
6289         kfree(net->dev_name_head);
6290         kfree(net->dev_index_head);
6291 }
6292
6293 static struct pernet_operations __net_initdata netdev_net_ops = {
6294         .init = netdev_init,
6295         .exit = netdev_exit,
6296 };
6297
6298 static void __net_exit default_device_exit(struct net *net)
6299 {
6300         struct net_device *dev, *aux;
6301         /*
6302          * Push all migratable network devices back to the
6303          * initial network namespace
6304          */
6305         rtnl_lock();
6306         for_each_netdev_safe(net, dev, aux) {
6307                 int err;
6308                 char fb_name[IFNAMSIZ];
6309
6310                 /* Ignore unmoveable devices (i.e. loopback) */
6311                 if (dev->features & NETIF_F_NETNS_LOCAL)
6312                         continue;
6313
6314                 /* Leave virtual devices for the generic cleanup */
6315                 if (dev->rtnl_link_ops)
6316                         continue;
6317
6318                 /* Push remaing network devices to init_net */
6319                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6320                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6321                 if (err) {
6322                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6323                                 __func__, dev->name, err);
6324                         BUG();
6325                 }
6326         }
6327         rtnl_unlock();
6328 }
6329
6330 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6331 {
6332         /* At exit all network devices most be removed from a network
6333          * namespace.  Do this in the reverse order of registration.
6334          * Do this across as many network namespaces as possible to
6335          * improve batching efficiency.
6336          */
6337         struct net_device *dev;
6338         struct net *net;
6339         LIST_HEAD(dev_kill_list);
6340
6341         rtnl_lock();
6342         list_for_each_entry(net, net_list, exit_list) {
6343                 for_each_netdev_reverse(net, dev) {
6344                         if (dev->rtnl_link_ops)
6345                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6346                         else
6347                                 unregister_netdevice_queue(dev, &dev_kill_list);
6348                 }
6349         }
6350         unregister_netdevice_many(&dev_kill_list);
6351         list_del(&dev_kill_list);
6352         rtnl_unlock();
6353 }
6354
6355 static struct pernet_operations __net_initdata default_device_ops = {
6356         .exit = default_device_exit,
6357         .exit_batch = default_device_exit_batch,
6358 };
6359
6360 /*
6361  *      Initialize the DEV module. At boot time this walks the device list and
6362  *      unhooks any devices that fail to initialise (normally hardware not
6363  *      present) and leaves us with a valid list of present and active devices.
6364  *
6365  */
6366
6367 /*
6368  *       This is called single threaded during boot, so no need
6369  *       to take the rtnl semaphore.
6370  */
6371 static int __init net_dev_init(void)
6372 {
6373         int i, rc = -ENOMEM;
6374
6375         BUG_ON(!dev_boot_phase);
6376
6377         if (dev_proc_init())
6378                 goto out;
6379
6380         if (netdev_kobject_init())
6381                 goto out;
6382
6383         INIT_LIST_HEAD(&ptype_all);
6384         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6385                 INIT_LIST_HEAD(&ptype_base[i]);
6386
6387         if (register_pernet_subsys(&netdev_net_ops))
6388                 goto out;
6389
6390         /*
6391          *      Initialise the packet receive queues.
6392          */
6393
6394         for_each_possible_cpu(i) {
6395                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6396
6397                 memset(sd, 0, sizeof(*sd));
6398                 skb_queue_head_init(&sd->input_pkt_queue);
6399                 skb_queue_head_init(&sd->process_queue);
6400                 sd->completion_queue = NULL;
6401                 INIT_LIST_HEAD(&sd->poll_list);
6402                 sd->output_queue = NULL;
6403                 sd->output_queue_tailp = &sd->output_queue;
6404 #ifdef CONFIG_RPS
6405                 sd->csd.func = rps_trigger_softirq;
6406                 sd->csd.info = sd;
6407                 sd->csd.flags = 0;
6408                 sd->cpu = i;
6409 #endif
6410
6411                 sd->backlog.poll = process_backlog;
6412                 sd->backlog.weight = weight_p;
6413                 sd->backlog.gro_list = NULL;
6414                 sd->backlog.gro_count = 0;
6415         }
6416
6417         dev_boot_phase = 0;
6418
6419         /* The loopback device is special if any other network devices
6420          * is present in a network namespace the loopback device must
6421          * be present. Since we now dynamically allocate and free the
6422          * loopback device ensure this invariant is maintained by
6423          * keeping the loopback device as the first device on the
6424          * list of network devices.  Ensuring the loopback devices
6425          * is the first device that appears and the last network device
6426          * that disappears.
6427          */
6428         if (register_pernet_device(&loopback_net_ops))
6429                 goto out;
6430
6431         if (register_pernet_device(&default_device_ops))
6432                 goto out;
6433
6434         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6435         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6436
6437         hotcpu_notifier(dev_cpu_callback, 0);
6438         dst_init();
6439         dev_mcast_init();
6440         rc = 0;
6441 out:
6442         return rc;
6443 }
6444
6445 subsys_initcall(net_dev_init);
6446
6447 static int __init initialize_hashrnd(void)
6448 {
6449         get_random_bytes(&hashrnd, sizeof(hashrnd));
6450         return 0;
6451 }
6452
6453 late_initcall_sync(initialize_hashrnd);
6454