net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135 #include <linux/cpu_rmap.h>
 136
 137 #include "net-sysfs.h"
 138
 139 /* Instead of increasing this, you should create a hash table. */
 140 #define MAX_GRO_SKBS 8
 141
 142 /* This should be increased if a protocol with a bigger head is added. */
 143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
 145 /*
 146  *      The list of packet types we will receive (as opposed to discard)
 147  *      and the routines to invoke.
 148  *
 149  *      Why 16. Because with 16 the only overlap we get on a hash of the
 150  *      low nibble of the protocol value is RARP/SNAP/X.25.
 151  *
 152  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 153  *             sure which should go first, but I bet it won't make much
 154  *             difference if we are running VLANs.  The good news is that
 155  *             this protocol won't be in the list unless compiled in, so
 156  *             the average user (w/out VLANs) will not be adversely affected.
 157  *             --BLG
 158  *
 159  *              0800    IP
 160  *              8100    802.1Q VLAN
 161  *              0001    802.3
 162  *              0002    AX.25
 163  *              0004    802.2
 164  *              8035    RARP
 165  *              0005    SNAP
 166  *              0805    X.25
 167  *              0806    ARP
 168  *              8137    IPX
 169  *              0009    Localtalk
 170  *              86DD    IPv6
 171  */
 172
 173 #define PTYPE_HASH_SIZE (16)
 174 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 175
 176 static DEFINE_SPINLOCK(ptype_lock);
 177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 178 static struct list_head ptype_all __read_mostly;        /* Taps */
 179
 180 /*
 181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182  * semaphore.
 183  *
 184  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 185  *
 186  * Writers must hold the rtnl semaphore while they loop through the
 187  * dev_base_head list, and hold dev_base_lock for writing when they do the
 188  * actual updates.  This allows pure readers to access the list even
 189  * while a writer is preparing to update it.
 190  *
 191  * To put it another way, dev_base_lock is held for writing only to
 192  * protect against pure readers; the rtnl semaphore provides the
 193  * protection against other writers.
 194  *
 195  * See, for example usages, register_netdevice() and
 196  * unregister_netdevice(), which must be called with the rtnl
 197  * semaphore held.
 198  */
 199 DEFINE_RWLOCK(dev_base_lock);
 200 EXPORT_SYMBOL(dev_base_lock);
 201
 202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 203 {
 204         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 205         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 206 }
 207
 208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 209 {
 210         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 211 }
 212
 213 static inline void rps_lock(struct softnet_data *sd)
 214 {
 215 #ifdef CONFIG_RPS
 216         spin_lock(&sd->input_pkt_queue.lock);
 217 #endif
 218 }
 219
 220 static inline void rps_unlock(struct softnet_data *sd)
 221 {
 222 #ifdef CONFIG_RPS
 223         spin_unlock(&sd->input_pkt_queue.lock);
 224 #endif
 225 }
 226
 227 /* Device list insertion */
 228 static int list_netdevice(struct net_device *dev)
 229 {
 230         struct net *net = dev_net(dev);
 231
 232         ASSERT_RTNL();
 233
 234         write_lock_bh(&dev_base_lock);
 235         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 236         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 237         hlist_add_head_rcu(&dev->index_hlist,
 238                            dev_index_hash(net, dev->ifindex));
 239         write_unlock_bh(&dev_base_lock);
 240         return 0;
 241 }
 242
 243 /* Device list removal
 244  * caller must respect a RCU grace period before freeing/reusing dev
 245  */
 246 static void unlist_netdevice(struct net_device *dev)
 247 {
 248         ASSERT_RTNL();
 249
 250         /* Unlink dev from the device chain */
 251         write_lock_bh(&dev_base_lock);
 252         list_del_rcu(&dev->dev_list);
 253         hlist_del_rcu(&dev->name_hlist);
 254         hlist_del_rcu(&dev->index_hlist);
 255         write_unlock_bh(&dev_base_lock);
 256 }
 257
 258 /*
 259  *      Our notifier list
 260  */
 261
 262 static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264 /*
 265  *      Device drivers call our routines to queue packets here. We empty the
 266  *      queue in the local softnet handler.
 267  */
 268
 269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 270 EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 272 #ifdef CONFIG_LOCKDEP
 273 /*
 274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 275  * according to dev->type
 276  */
 277 static const unsigned short netdev_lock_type[] =
 278         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 279          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 280          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 281          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 282          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 283          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 284          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 285          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 286          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 287          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 288          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 289          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 290          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 291          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 292          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 293          ARPHRD_VOID, ARPHRD_NONE};
 294
 295 static const char *const netdev_lock_name[] =
 296         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 297          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 298          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 299          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 300          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 301          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 302          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 303          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 304          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 305          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 306          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 307          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 308          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 309          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 310          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 311          "_xmit_VOID", "_xmit_NONE"};
 312
 313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 315
 316 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 317 {
 318         int i;
 319
 320         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 321                 if (netdev_lock_type[i] == dev_type)
 322                         return i;
 323         /* the last key is used by default */
 324         return ARRAY_SIZE(netdev_lock_type) - 1;
 325 }
 326
 327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 328                                                  unsigned short dev_type)
 329 {
 330         int i;
 331
 332         i = netdev_lock_pos(dev_type);
 333         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 334                                    netdev_lock_name[i]);
 335 }
 336
 337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 338 {
 339         int i;
 340
 341         i = netdev_lock_pos(dev->type);
 342         lockdep_set_class_and_name(&dev->addr_list_lock,
 343                                    &netdev_addr_lock_key[i],
 344                                    netdev_lock_name[i]);
 345 }
 346 #else
 347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 348                                                  unsigned short dev_type)
 349 {
 350 }
 351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 352 {
 353 }
 354 #endif
 355
 356 /*******************************************************************************
 357
 358                 Protocol management and registration routines
 359
 360 *******************************************************************************/
 361
 362 /*
 363  *      Add a protocol ID to the list. Now that the input handler is
 364  *      smarter we can dispense with all the messy stuff that used to be
 365  *      here.
 366  *
 367  *      BEWARE!!! Protocol handlers, mangling input packets,
 368  *      MUST BE last in hash buckets and checking protocol handlers
 369  *      MUST start from promiscuous ptype_all chain in net_bh.
 370  *      It is true now, do not change it.
 371  *      Explanation follows: if protocol handler, mangling packet, will
 372  *      be the first on list, it is not able to sense, that packet
 373  *      is cloned and should be copied-on-write, so that it will
 374  *      change it and subsequent readers will get broken packet.
 375  *                                                      --ANK (980803)
 376  */
 377
 378 static inline struct list_head *ptype_head(const struct packet_type *pt)
 379 {
 380         if (pt->type == htons(ETH_P_ALL))
 381                 return &ptype_all;
 382         else
 383                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 384 }
 385
 386 /**
 387  *      dev_add_pack - add packet handler
 388  *      @pt: packet type declaration
 389  *
 390  *      Add a protocol handler to the networking stack. The passed &packet_type
 391  *      is linked into kernel lists and may not be freed until it has been
 392  *      removed from the kernel lists.
 393  *
 394  *      This call does not sleep therefore it can not
 395  *      guarantee all CPU's that are in middle of receiving packets
 396  *      will see the new packet type (until the next received packet).
 397  */
 398
 399 void dev_add_pack(struct packet_type *pt)
 400 {
 401         struct list_head *head = ptype_head(pt);
 402
 403         spin_lock(&ptype_lock);
 404         list_add_rcu(&pt->list, head);
 405         spin_unlock(&ptype_lock);
 406 }
 407 EXPORT_SYMBOL(dev_add_pack);
 408
 409 /**
 410  *      __dev_remove_pack        - remove packet handler
 411  *      @pt: packet type declaration
 412  *
 413  *      Remove a protocol handler that was previously added to the kernel
 414  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 415  *      from the kernel lists and can be freed or reused once this function
 416  *      returns.
 417  *
 418  *      The packet type might still be in use by receivers
 419  *      and must not be freed until after all the CPU's have gone
 420  *      through a quiescent state.
 421  */
 422 void __dev_remove_pack(struct packet_type *pt)
 423 {
 424         struct list_head *head = ptype_head(pt);
 425         struct packet_type *pt1;
 426
 427         spin_lock(&ptype_lock);
 428
 429         list_for_each_entry(pt1, head, list) {
 430                 if (pt == pt1) {
 431                         list_del_rcu(&pt->list);
 432                         goto out;
 433                 }
 434         }
 435
 436         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 437 out:
 438         spin_unlock(&ptype_lock);
 439 }
 440 EXPORT_SYMBOL(__dev_remove_pack);
 441
 442 /**
 443  *      dev_remove_pack  - remove packet handler
 444  *      @pt: packet type declaration
 445  *
 446  *      Remove a protocol handler that was previously added to the kernel
 447  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 448  *      from the kernel lists and can be freed or reused once this function
 449  *      returns.
 450  *
 451  *      This call sleeps to guarantee that no CPU is looking at the packet
 452  *      type after return.
 453  */
 454 void dev_remove_pack(struct packet_type *pt)
 455 {
 456         __dev_remove_pack(pt);
 457
 458         synchronize_net();
 459 }
 460 EXPORT_SYMBOL(dev_remove_pack);
 461
 462 /******************************************************************************
 463
 464                       Device Boot-time Settings Routines
 465
 466 *******************************************************************************/
 467
 468 /* Boot time configuration table */
 469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 470
 471 /**
 472  *      netdev_boot_setup_add   - add new setup entry
 473  *      @name: name of the device
 474  *      @map: configured settings for the device
 475  *
 476  *      Adds new setup entry to the dev_boot_setup list.  The function
 477  *      returns 0 on error and 1 on success.  This is a generic routine to
 478  *      all netdevices.
 479  */
 480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 481 {
 482         struct netdev_boot_setup *s;
 483         int i;
 484
 485         s = dev_boot_setup;
 486         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 487                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 488                         memset(s[i].name, 0, sizeof(s[i].name));
 489                         strlcpy(s[i].name, name, IFNAMSIZ);
 490                         memcpy(&s[i].map, map, sizeof(s[i].map));
 491                         break;
 492                 }
 493         }
 494
 495         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 496 }
 497
 498 /**
 499  *      netdev_boot_setup_check - check boot time settings
 500  *      @dev: the netdevice
 501  *
 502  *      Check boot time settings for the device.
 503  *      The found settings are set for the device to be used
 504  *      later in the device probing.
 505  *      Returns 0 if no settings found, 1 if they are.
 506  */
 507 int netdev_boot_setup_check(struct net_device *dev)
 508 {
 509         struct netdev_boot_setup *s = dev_boot_setup;
 510         int i;
 511
 512         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 513                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 514                     !strcmp(dev->name, s[i].name)) {
 515                         dev->irq        = s[i].map.irq;
 516                         dev->base_addr  = s[i].map.base_addr;
 517                         dev->mem_start  = s[i].map.mem_start;
 518                         dev->mem_end    = s[i].map.mem_end;
 519                         return 1;
 520                 }
 521         }
 522         return 0;
 523 }
 524 EXPORT_SYMBOL(netdev_boot_setup_check);
 525
 526
 527 /**
 528  *      netdev_boot_base        - get address from boot time settings
 529  *      @prefix: prefix for network device
 530  *      @unit: id for network device
 531  *
 532  *      Check boot time settings for the base address of device.
 533  *      The found settings are set for the device to be used
 534  *      later in the device probing.
 535  *      Returns 0 if no settings found.
 536  */
 537 unsigned long netdev_boot_base(const char *prefix, int unit)
 538 {
 539         const struct netdev_boot_setup *s = dev_boot_setup;
 540         char name[IFNAMSIZ];
 541         int i;
 542
 543         sprintf(name, "%s%d", prefix, unit);
 544
 545         /*
 546          * If device already registered then return base of 1
 547          * to indicate not to probe for this interface
 548          */
 549         if (__dev_get_by_name(&init_net, name))
 550                 return 1;
 551
 552         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 553                 if (!strcmp(name, s[i].name))
 554                         return s[i].map.base_addr;
 555         return 0;
 556 }
 557
 558 /*
 559  * Saves at boot time configured settings for any netdevice.
 560  */
 561 int __init netdev_boot_setup(char *str)
 562 {
 563         int ints[5];
 564         struct ifmap map;
 565
 566         str = get_options(str, ARRAY_SIZE(ints), ints);
 567         if (!str || !*str)
 568                 return 0;
 569
 570         /* Save settings */
 571         memset(&map, 0, sizeof(map));
 572         if (ints[0] > 0)
 573                 map.irq = ints[1];
 574         if (ints[0] > 1)
 575                 map.base_addr = ints[2];
 576         if (ints[0] > 2)
 577                 map.mem_start = ints[3];
 578         if (ints[0] > 3)
 579                 map.mem_end = ints[4];
 580
 581         /* Add new entry to the list */
 582         return netdev_boot_setup_add(str, &map);
 583 }
 584
 585 __setup("netdev=", netdev_boot_setup);
 586
 587 /*******************************************************************************
 588
 589                             Device Interface Subroutines
 590
 591 *******************************************************************************/
 592
 593 /**
 594  *      __dev_get_by_name       - find a device by its name
 595  *      @net: the applicable net namespace
 596  *      @name: name to find
 597  *
 598  *      Find an interface by name. Must be called under RTNL semaphore
 599  *      or @dev_base_lock. If the name is found a pointer to the device
 600  *      is returned. If the name is not found then %NULL is returned. The
 601  *      reference counters are not incremented so the caller must be
 602  *      careful with locks.
 603  */
 604
 605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 606 {
 607         struct hlist_node *p;
 608         struct net_device *dev;
 609         struct hlist_head *head = dev_name_hash(net, name);
 610
 611         hlist_for_each_entry(dev, p, head, name_hlist)
 612                 if (!strncmp(dev->name, name, IFNAMSIZ))
 613                         return dev;
 614
 615         return NULL;
 616 }
 617 EXPORT_SYMBOL(__dev_get_by_name);
 618
 619 /**
 620  *      dev_get_by_name_rcu     - find a device by its name
 621  *      @net: the applicable net namespace
 622  *      @name: name to find
 623  *
 624  *      Find an interface by name.
 625  *      If the name is found a pointer to the device is returned.
 626  *      If the name is not found then %NULL is returned.
 627  *      The reference counters are not incremented so the caller must be
 628  *      careful with locks. The caller must hold RCU lock.
 629  */
 630
 631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 632 {
 633         struct hlist_node *p;
 634         struct net_device *dev;
 635         struct hlist_head *head = dev_name_hash(net, name);
 636
 637         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 638                 if (!strncmp(dev->name, name, IFNAMSIZ))
 639                         return dev;
 640
 641         return NULL;
 642 }
 643 EXPORT_SYMBOL(dev_get_by_name_rcu);
 644
 645 /**
 646  *      dev_get_by_name         - find a device by its name
 647  *      @net: the applicable net namespace
 648  *      @name: name to find
 649  *
 650  *      Find an interface by name. This can be called from any
 651  *      context and does its own locking. The returned handle has
 652  *      the usage count incremented and the caller must use dev_put() to
 653  *      release it when it is no longer needed. %NULL is returned if no
 654  *      matching device is found.
 655  */
 656
 657 struct net_device *dev_get_by_name(struct net *net, const char *name)
 658 {
 659         struct net_device *dev;
 660
 661         rcu_read_lock();
 662         dev = dev_get_by_name_rcu(net, name);
 663         if (dev)
 664                 dev_hold(dev);
 665         rcu_read_unlock();
 666         return dev;
 667 }
 668 EXPORT_SYMBOL(dev_get_by_name);
 669
 670 /**
 671  *      __dev_get_by_index - find a device by its ifindex
 672  *      @net: the applicable net namespace
 673  *      @ifindex: index of device
 674  *
 675  *      Search for an interface by index. Returns %NULL if the device
 676  *      is not found or a pointer to the device. The device has not
 677  *      had its reference counter increased so the caller must be careful
 678  *      about locking. The caller must hold either the RTNL semaphore
 679  *      or @dev_base_lock.
 680  */
 681
 682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 683 {
 684         struct hlist_node *p;
 685         struct net_device *dev;
 686         struct hlist_head *head = dev_index_hash(net, ifindex);
 687
 688         hlist_for_each_entry(dev, p, head, index_hlist)
 689                 if (dev->ifindex == ifindex)
 690                         return dev;
 691
 692         return NULL;
 693 }
 694 EXPORT_SYMBOL(__dev_get_by_index);
 695
 696 /**
 697  *      dev_get_by_index_rcu - find a device by its ifindex
 698  *      @net: the applicable net namespace
 699  *      @ifindex: index of device
 700  *
 701  *      Search for an interface by index. Returns %NULL if the device
 702  *      is not found or a pointer to the device. The device has not
 703  *      had its reference counter increased so the caller must be careful
 704  *      about locking. The caller must hold RCU lock.
 705  */
 706
 707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 708 {
 709         struct hlist_node *p;
 710         struct net_device *dev;
 711         struct hlist_head *head = dev_index_hash(net, ifindex);
 712
 713         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 714                 if (dev->ifindex == ifindex)
 715                         return dev;
 716
 717         return NULL;
 718 }
 719 EXPORT_SYMBOL(dev_get_by_index_rcu);
 720
 721
 722 /**
 723  *      dev_get_by_index - find a device by its ifindex
 724  *      @net: the applicable net namespace
 725  *      @ifindex: index of device
 726  *
 727  *      Search for an interface by index. Returns NULL if the device
 728  *      is not found or a pointer to the device. The device returned has
 729  *      had a reference added and the pointer is safe until the user calls
 730  *      dev_put to indicate they have finished with it.
 731  */
 732
 733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 734 {
 735         struct net_device *dev;
 736
 737         rcu_read_lock();
 738         dev = dev_get_by_index_rcu(net, ifindex);
 739         if (dev)
 740                 dev_hold(dev);
 741         rcu_read_unlock();
 742         return dev;
 743 }
 744 EXPORT_SYMBOL(dev_get_by_index);
 745
 746 /**
 747  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 748  *      @net: the applicable net namespace
 749  *      @type: media type of device
 750  *      @ha: hardware address
 751  *
 752  *      Search for an interface by MAC address. Returns NULL if the device
 753  *      is not found or a pointer to the device.
 754  *      The caller must hold RCU or RTNL.
 755  *      The returned device has not had its ref count increased
 756  *      and the caller must therefore be careful about locking
 757  *
 758  */
 759
 760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 761                                        const char *ha)
 762 {
 763         struct net_device *dev;
 764
 765         for_each_netdev_rcu(net, dev)
 766                 if (dev->type == type &&
 767                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 768                         return dev;
 769
 770         return NULL;
 771 }
 772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 773
 774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 775 {
 776         struct net_device *dev;
 777
 778         ASSERT_RTNL();
 779         for_each_netdev(net, dev)
 780                 if (dev->type == type)
 781                         return dev;
 782
 783         return NULL;
 784 }
 785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 786
 787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 788 {
 789         struct net_device *dev, *ret = NULL;
 790
 791         rcu_read_lock();
 792         for_each_netdev_rcu(net, dev)
 793                 if (dev->type == type) {
 794                         dev_hold(dev);
 795                         ret = dev;
 796                         break;
 797                 }
 798         rcu_read_unlock();
 799         return ret;
 800 }
 801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 802
 803 /**
 804  *      dev_get_by_flags_rcu - find any device with given flags
 805  *      @net: the applicable net namespace
 806  *      @if_flags: IFF_* values
 807  *      @mask: bitmask of bits in if_flags to check
 808  *
 809  *      Return the first interface whose flags agree with @if_flags on
 810  *      every bit selected by @mask, or NULL if no interface qualifies.
 811  *      Must be called inside rcu_read_lock(); the reference count of the
 812  *      returned device is left unchanged.
 813  */
 814
 815 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 816                                     unsigned short mask)
 817 {
 818         struct net_device *cur;
 819
 820         for_each_netdev_rcu(net, cur) {
 821                 /* XOR leaves a bit set wherever the two flag words differ. */
 822                 if (!((cur->flags ^ if_flags) & mask))
 823                         return cur;
 824         }
 825
 826         return NULL;
 827 }
 828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 829
 830 /**
 831  *      dev_valid_name - check if name is okay for network device
 832  *      @name: name string
 833  *
 834  *      Network device names need to be valid file names to allow
 835  *      sysfs to work.  The name must be non-empty, shorter than
 836  *      IFNAMSIZ, not "." or "..", and free of '/' and whitespace.
 837  *      Returns 1 if @name is acceptable and 0 otherwise.
 838  */
 839 int dev_valid_name(const char *name)
 840 {
 841         const char *p;
 842
 843         if (!name[0] || strlen(name) >= IFNAMSIZ)
 844                 return 0;
 845         if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
 846                 return 0;
 847
 848         for (p = name; *p != '\0'; p++)
 849                 if (*p == '/' || isspace(*p))
 850                         return 0;
 851
 852         return 1;
 853 }
 854 EXPORT_SYMBOL(dev_valid_name);
 855
 856 /**
 857  *      __dev_alloc_name - allocate a name for a device
 858  *      @net: network namespace to allocate the device name in
 859  *      @name: name format string
 860  *      @buf:  scratch buffer and result name string
 861  *
 862  *      Passed a format string - eg "lt%d" it will try and find a suitable
 863  *      id. It scans list of devices to build up a free map, then chooses
 864  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 865  *      while allocating the name and adding the device in order to avoid
 866  *      duplicates.
 867  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 868  *      Returns the number of the unit assigned or a negative errno code.
 869  */
 870
 871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 872 {
 873         int i = 0;
 874         const char *p;
 875         const int max_netdevices = 8*PAGE_SIZE;
 876         unsigned long *inuse;
 877         struct net_device *d;
 878
             /* Only the first IFNAMSIZ-1 characters are searched for a '%'. */
 879         p = strnchr(name, IFNAMSIZ-1, '%');
 880         if (p) {
 881                 /*
 882                  * Verify the string as this thing may have come from
 883                  * the user.  There must be exactly one "%d" and no other
 884                  * "%" characters.
 885                  */
 886                 if (p[1] != 'd' || strchr(p + 2, '%'))
 887                         return -EINVAL;
 888
 889                 /* Use one page as a bit array of possible slots */
 890                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 891                 if (!inuse)
 892                         return -ENOMEM;
 893
                     /* Record the unit number of every device that matches @name. */
 894                 for_each_netdev(net, d) {
 895                         if (!sscanf(d->name, name, &i))
 896                                 continue;
 897                         if (i < 0 || i >= max_netdevices)
 898                                 continue;
 899
 900                         /*  avoid cases where sscanf is not exact inverse of printf */
 901                         snprintf(buf, IFNAMSIZ, name, i);
 902                         if (!strncmp(buf, d->name, IFNAMSIZ))
 903                                 set_bit(i, inuse);
 904                 }
 905
 906                 i = find_first_zero_bit(inuse, max_netdevices);
 907                 free_page((unsigned long) inuse);
 908         }
 909
             /*
              * If no '%' was found above, i is still 0 here.  Formatting is
              * skipped when the caller passed the same pointer for @name and
              * @buf.
              */
 910         if (buf != name)
 911                 snprintf(buf, IFNAMSIZ, name, i);
 912         if (!__dev_get_by_name(net, buf))
 913                 return i;
 914
 915         /* It is possible to run out of possible slots
 916          * when the name is long and there isn't enough space left
 917          * for the digits, or if all bits are used.
 918          */
 919         return -ENFILE;
 920 }
 921
 922 /**
 923  *      dev_alloc_name - allocate a name for a device
 924  *      @dev: device
 925  *      @name: name format string
 926  *
 927  *      Passed a format string - eg "lt%d" it will try and find a suitable
 928  *      id. It scans list of devices to build up a free map, then chooses
 929  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 930  *      while allocating the name and adding the device in order to avoid
 931  *      duplicates.
 932  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 933  *      Returns the number of the unit assigned or a negative errno code.
 934  */
 935
 936 int dev_alloc_name(struct net_device *dev, const char *name)
 937 {
 938         char buf[IFNAMSIZ];
 939         struct net *net;
 940         int ret;
 941
 942         BUG_ON(!dev_net(dev));
 943         net = dev_net(dev);
             /* Build the name in a local buffer: dev->name is written only on success. */
 944         ret = __dev_alloc_name(net, name, buf);
 945         if (ret >= 0)
 946                 strlcpy(dev->name, buf, IFNAMSIZ);
 947         return ret;
 948 }
 949 EXPORT_SYMBOL(dev_alloc_name);
 950
     /*
      *      dev_get_valid_name - validate a name and store it in dev->name
      *      @dev: device
      *      @name: requested name, or a format string when @fmt is set
      *      @fmt: allow @name to be a "%d" style pattern
      *
      *      Returns -EINVAL if dev_valid_name() rejects @name.  If @fmt is
      *      set and @name contains a '%', the work is handed to
      *      dev_alloc_name() and its result (unit number or negative errno)
      *      is returned.  Otherwise returns -EEXIST if a device of that name
      *      already exists in the namespace, or 0 after copying @name into
      *      dev->name (the copy is skipped when @name already is dev->name).
      */
 951 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 952 {
 953         struct net *net;
 954
 955         BUG_ON(!dev_net(dev));
 956         net = dev_net(dev);
 957
 958         if (!dev_valid_name(name))
 959                 return -EINVAL;
 960
 961         if (fmt && strchr(name, '%'))
 962                 return dev_alloc_name(dev, name);
 963         else if (__dev_get_by_name(net, name))
 964                 return -EEXIST;
 965         else if (dev->name != name)
 966                 strlcpy(dev->name, name, IFNAMSIZ);
 967
 968         return 0;
 969 }
 970
 971 /**
 972  *      dev_change_name - change name of a device
 973  *      @dev: device
 974  *      @newname: name (or format string) must be at least IFNAMSIZ
 975  *
 976  *      Change name of a device, can pass format strings "eth%d"
 977  *      for wildcarding.  The caller must hold RTNL and the device must
      *      be down, otherwise -EBUSY is returned.  If a NETDEV_CHANGENAME
      *      notifier vetoes the change, the old name is restored and the
      *      notifier's errno is returned.  On success the return value is
      *      non-negative (0, or the unit number chosen for a "%d" pattern).
 978  */
 979 int dev_change_name(struct net_device *dev, const char *newname)
 980 {
 981         char oldname[IFNAMSIZ];
 982         int err = 0;
 983         int ret;
 984         struct net *net;
 985
 986         ASSERT_RTNL();
 987         BUG_ON(!dev_net(dev));
 988
 989         net = dev_net(dev);
 990         if (dev->flags & IFF_UP)
 991                 return -EBUSY;
 992
             /* Renaming to the current name is a successful no-op. */
 993         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 994                 return 0;
 995
             /* Keep the old name so that a failure can be rolled back. */
 996         memcpy(oldname, dev->name, IFNAMSIZ);
 997
 998         err = dev_get_valid_name(dev, newname, 1);
 999         if (err < 0)
1000                 return err;
1001
1002 rollback:
1003         ret = device_rename(&dev->dev, dev->name);
1004         if (ret) {
1005                 memcpy(dev->name, oldname, IFNAMSIZ);
1006                 return ret;
1007         }
1008
             /*
              * The name hash is walked by RCU readers, so the entry has to be
              * unlinked with the RCU-safe primitive: plain hlist_del() poisons
              * the node's forward pointer, which a concurrent reader standing
              * on this node would then follow.
              */
1009         write_lock_bh(&dev_base_lock);
1010         hlist_del_rcu(&dev->name_hlist);
1011         write_unlock_bh(&dev_base_lock);
1012
             /* Let current readers finish before the node is linked in again. */
1013         synchronize_rcu();
1014
1015         write_lock_bh(&dev_base_lock);
1016         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017         write_unlock_bh(&dev_base_lock);
1018
1019         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020         ret = notifier_to_errno(ret);
1021
1022         if (ret) {
1023                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1024                 if (err >= 0) {
1025                         err = ret;
1026                         memcpy(dev->name, oldname, IFNAMSIZ);
1027                         goto rollback;
1028                 } else {
1029                         printk(KERN_ERR
1030                                "%s: name change rollback failed: %d.\n",
1031                                dev->name, ret);
1032                 }
1033         }
1034
1035         return err;
1036 }
1037
1038 /**
1039  *      dev_set_alias - change ifalias of a device
1040  *      @dev: device
1041  *      @alias: name up to IFALIASZ
1042  *      @len: limit of bytes to copy from info
1043  *
1044  *      Set ifalias for a device.  The caller must hold RTNL.  A @len of
      *      zero removes the current alias and returns 0.  Otherwise returns
      *      @len on success, -EINVAL if @len is not below IFALIASZ, or
      *      -ENOMEM if the buffer cannot be resized; in the -ENOMEM case the
      *      previous alias is left in place.
1045  */
1046 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047 {
             char *new_ifalias;

1048         ASSERT_RTNL();
1049
1050         if (len >= IFALIASZ)
1051                 return -EINVAL;
1052
1053         if (!len) {
1054                 if (dev->ifalias) {
1055                         kfree(dev->ifalias);
1056                         dev->ifalias = NULL;
1057                 }
1058                 return 0;
1059         }
1060
             /*
              * Resize through a temporary: assigning the result straight to
              * dev->ifalias would lose (and leak) the old buffer whenever
              * krealloc() fails and returns NULL.
              */
1061         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062         if (!new_ifalias)
1063                 return -ENOMEM;
             dev->ifalias = new_ifalias;
1064
1065         strlcpy(dev->ifalias, alias, len+1);
1066         return len;
1067 }
1068
1069
1070 /**
1071  *      netdev_features_change - device changes features
1072  *      @dev: device to cause notification
1073  *
1074  *      Called to indicate a device has changed features.
1075  */
1076 void netdev_features_change(struct net_device *dev)
1077 {
             /*
              * call_netdevice_notifiers() asserts RTNL, so callers must hold it.
              * The notifiers' verdict is discarded; this function returns nothing.
              */
1078         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079 }
1080 EXPORT_SYMBOL(netdev_features_change);
1081
1082 /**
1083  *      netdev_state_change - device changes state
1084  *      @dev: device to cause notification
1085  *
1086  *      Called to indicate a device has changed state.  For a device that
1087  *      is up, runs the netdev notifier chain with NETDEV_CHANGE and sends
1088  *      a NEWLINK message to the routing socket; does nothing otherwise.
1089  */
1090 void netdev_state_change(struct net_device *dev)
1091 {
1092         if (!(dev->flags & IFF_UP))
1093                 return;
1094         call_netdevice_notifiers(NETDEV_CHANGE, dev);
1095         rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1096 }
1097 EXPORT_SYMBOL(netdev_state_change);
1098
     /*
      *      netdev_bonding_change - forward an event to the netdev notifiers
      *      @dev: device the event refers to
      *      @event: notifier event value, passed through unmodified
      *
      *      Thin exported wrapper around call_netdevice_notifiers(); returns
      *      that function's result as is.
      */
1099 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1100 {
1101         return call_netdevice_notifiers(event, dev);
1102 }
1103 EXPORT_SYMBOL(netdev_bonding_change);
1104
1105 /**
1106  *      dev_load        - load a network module
1107  *      @net: the applicable net namespace
1108  *      @name: name of interface
1109  *
1110  *      If a network interface is not present and the process has suitable
1111  *      privileges this function loads the module. If module loading is not
1112  *      available in this kernel then it becomes a nop.
1113  */
1114
1115 void dev_load(struct net *net, const char *name)
1116 {
1117         struct net_device *dev;
1118
             /*
              * dev is used purely as a "does it exist?" flag: it is never
              * dereferenced after the RCU read section ends and no reference
              * is taken.
              */
1119         rcu_read_lock();
1120         dev = dev_get_by_name_rcu(net, name);
1121         rcu_read_unlock();
1122
             /* "%s" keeps @name from being interpreted as a format string. */
1123         if (!dev && capable(CAP_NET_ADMIN))
1124                 request_module("%s", name);
1125 }
1126 EXPORT_SYMBOL(dev_load);
1127
     /*
      *      __dev_open - bring a device up without announcing it
      *      @dev: device to open
      *
      *      Runs the NETDEV_PRE_UP notifiers, marks the link state as started
      *      and calls the driver's ndo_validate_addr and ndo_open hooks when
      *      they exist.  On success IFF_UP is set and the device is activated;
      *      on failure __LINK_STATE_START is cleared again.  Returns -ENODEV
      *      if the device is not present, otherwise 0 or the error from the
      *      notifiers or the driver hooks.  The caller must hold RTNL.  The
      *      RTM_NEWLINK message and NETDEV_UP event are not sent here; see
      *      dev_open().
      */
1128 static int __dev_open(struct net_device *dev)
1129 {
1130         const struct net_device_ops *ops = dev->netdev_ops;
1131         int ret;
1132
1133         ASSERT_RTNL();
1134
1135         /*
1136          *      Is it even present?
1137          */
1138         if (!netif_device_present(dev))
1139                 return -ENODEV;
1140
             /* Any notifier may veto the open at this point. */
1141         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1142         ret = notifier_to_errno(ret);
1143         if (ret)
1144                 return ret;
1145
1146         /*
1147          *      Call device private open method
1148          */
1149         set_bit(__LINK_STATE_START, &dev->state);
1150
             /* ret is 0 here, so a driver without either hook opens trivially. */
1151         if (ops->ndo_validate_addr)
1152                 ret = ops->ndo_validate_addr(dev);
1153
1154         if (!ret && ops->ndo_open)
1155                 ret = ops->ndo_open(dev);
1156
1157         /*
1158          *      If it went open OK then:
1159          */
1160
1161         if (ret)
1162                 clear_bit(__LINK_STATE_START, &dev->state);
1163         else {
1164                 /*
1165                  *      Set the flags.
1166                  */
1167                 dev->flags |= IFF_UP;
1168
1169                 /*
1170                  *      Enable NET_DMA
1171                  */
1172                 net_dmaengine_get();
1173
1174                 /*
1175                  *      Initialize multicasting status
1176                  */
1177                 dev_set_rx_mode(dev);
1178
1179                 /*
1180                  *      Wakeup transmit queue engine
1181                  */
1182                 dev_activate(dev);
1183         }
1184
1185         return ret;
1186 }
1187
1188 /**
1189  *      dev_open        - prepare an interface for use.
1190  *      @dev:   device to open
1191  *
1192  *      Bring a device from the down state to the up state via __dev_open()
1193  *      and, once that has succeeded, announce it with an RTM_NEWLINK
1194  *      message and a %NETDEV_UP event on the netdev notifier chain.
1195  *
1196  *      Calling this function on an active interface is a nop. On a failure
1197  *      a negative errno code is returned.
1198  */
1199 int dev_open(struct net_device *dev)
1200 {
1201         int err;
1202
1203         /* Already up: nothing to do. */
1204         if (dev->flags & IFF_UP)
1205                 return 0;
1206
1207         err = __dev_open(dev);
1208         if (err >= 0) {
1209                 /* The device opened; tell everybody about it. */
1210                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1211                 call_netdevice_notifiers(NETDEV_UP, dev);
1212         }
1213
1214         return err;
1215 }
1216 EXPORT_SYMBOL(dev_open);
1226
     /*
      *      __dev_close_many - take a list of devices down, without the
      *      final announcement
      *      @head: list of devices, linked through their unreg_list member
      *
      *      Works in three passes over @head: send NETDEV_GOING_DOWN and clear
      *      __LINK_STATE_START for every device, deactivate all of them with
      *      dev_deactivate_many(), then call each driver's ndo_stop, clear
      *      IFF_UP and drop the NET_DMA reference.  It does not check IFF_UP
      *      itself and sends no NETDEV_DOWN; dev_close_many() does both.
      *      Always returns 0.  The caller must hold RTNL.
      */
1227 static int __dev_close_many(struct list_head *head)
1228 {
1229         struct net_device *dev;
1230
1231         ASSERT_RTNL();
1232         might_sleep();
1233
1234         list_for_each_entry(dev, head, unreg_list) {
1235                 /*
1236                  *      Tell people we are going down, so that they can
1237                  *      prepare for death while the device is still operating.
1238                  */
1239                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1240
1241                 clear_bit(__LINK_STATE_START, &dev->state);
1242
1243                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1244                  * can be even on different cpu. So just clear netif_running().
1245                  *
1246                  * dev->stop() will invoke napi_disable() on all of its
1247                  * napi_struct instances on this device.
1248                  */
1249                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1250         }
1251
1252         dev_deactivate_many(head);
1253
1254         list_for_each_entry(dev, head, unreg_list) {
1255                 const struct net_device_ops *ops = dev->netdev_ops;
1256
1257                 /*
1258                  *      Call the device specific close. This cannot fail.
1259                  *      Only if device is UP
1260                  *
1261                  *      We allow it to be called even after a DETACH hot-plug
1262                  *      event.
1263                  */
1264                 if (ops->ndo_stop)
1265                         ops->ndo_stop(dev);
1266
1267                 /*
1268                  *      Device is now down.
1269                  */
1270
1271                 dev->flags &= ~IFF_UP;
1272
1273                 /*
1274                  *      Shutdown NET_DMA
1275                  */
1276                 net_dmaengine_put();
1277         }
1278
1279         return 0;
1280 }
1281
1282 static int __dev_close(struct net_device *dev)
1283 {
1284         LIST_HEAD(single);
1285
1286         list_add(&dev->unreg_list, &single);
1287         return __dev_close_many(&single);
1288 }
1289
     /*
      *      dev_close_many - close and announce every device on a list that is up
      *      @head: list of devices, linked through their unreg_list member
      *
      *      Devices that are not IFF_UP are parked on a temporary list so
      *      that neither __dev_close_many() nor the down notification touches
      *      them.  The remaining devices are closed, RTM_NEWLINK and
      *      NETDEV_DOWN are sent for each, and the parked devices are spliced
      *      back so that @head again holds every entry it started with (the
      *      parked ones now come first).  Always returns 0.
      */
1290 static int dev_close_many(struct list_head *head)
1291 {
1292         struct net_device *dev, *tmp;
1293         LIST_HEAD(tmp_list);
1294
             /* _safe iteration: entries are moved off @head while walking it. */
1295         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1296                 if (!(dev->flags & IFF_UP))
1297                         list_move(&dev->unreg_list, &tmp_list);
1298
1299         __dev_close_many(head);
1300
1301         /*
1302          * Tell people we are down
1303          */
1304         list_for_each_entry(dev, head, unreg_list) {
1305                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1306                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1307         }
1308
1309         /* rollback_registered_many needs the complete original list */
1310         list_splice(&tmp_list, head);
1311         return 0;
1312 }
1313
1314 /**
1315  *      dev_close - shutdown an interface.
1316  *      @dev: device to shutdown
1317  *
1318  *      This function moves an active device into down state. A
1319  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1320  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1321  *      chain.  A device that is not up is left untouched.  Always
      *      returns 0.
1322  */
1323 int dev_close(struct net_device *dev)
1324 {
1325         LIST_HEAD(single);
1326
1327         list_add(&dev->unreg_list, &single);
1328         dev_close_many(&single);
             /*
              * Unhook the on-stack list head again.  Without this,
              * dev->unreg_list keeps pointing into this stack frame after we
              * return, and the next list operation on it writes to dead stack
              * memory.
              */
             list_del(&single);
1329
1330         return 0;
1331 }
1332 EXPORT_SYMBOL(dev_close);
1333
1334
1335 /**
1336  *      dev_disable_lro - disable Large Receive Offload on a device
1337  *      @dev: device
1338  *
1339  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1340  *      called under RTNL.  This is needed if received packets may be
1341  *      forwarded to another interface.
1342  */
1343 void dev_disable_lro(struct net_device *dev)
1344 {
             /* Only possible when the driver offers both ethtool flag hooks. */
1345         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1346             dev->ethtool_ops->set_flags) {
1347                 u32 flags = dev->ethtool_ops->get_flags(dev);
1348                 if (flags & ETH_FLAG_LRO) {
1349                         flags &= ~ETH_FLAG_LRO;
                             /* NOTE(review): whatever set_flags() returns is ignored. */
1350                         dev->ethtool_ops->set_flags(dev, flags);
1351                 }
1352         }
             /* Complain if LRO is still advertised in dev->features afterwards. */
1353         WARN_ON(dev->features & NETIF_F_LRO);
1354 }
1355 EXPORT_SYMBOL(dev_disable_lro);
1356
1357
     /*
      * While this is non-zero, register_netdevice_notifier() does not replay
      * events for existing devices to a newly registered notifier.
      */
1358 static int dev_boot_phase = 1;
1359
1360 /*
1361  *      Device change register/unregister. These are not inline or static
1362  *      as we export them to the world.
1363  */
1364
1365 /**
1366  *      register_netdevice_notifier - register a network notifier block
1367  *      @nb: notifier
1368  *
1369  *      Register a notifier to be called when network device events occur.
1370  *      The notifier passed is linked into the kernel structures and must
1371  *      not be reused until it has been unregistered. A negative errno code
1372  *      is returned on a failure.
1373  *
1374  *      When registered all registration and up events are replayed
1375  *      to the new notifier to allow device to have a race free
1376  *      view of the network device list.
      *
      *      If the notifier rejects a replayed NETDEV_REGISTER, the devices
      *      already announced to it are torn down again (in the same order)
      *      and the notifier is removed from the chain.  No replay happens
      *      while dev_boot_phase is set.
1377  */
1378
1379 int register_netdevice_notifier(struct notifier_block *nb)
1380 {
1381         struct net_device *dev;
1382         struct net_device *last;
1383         struct net *net;
1384         int err;
1385
1386         rtnl_lock();
1387         err = raw_notifier_chain_register(&netdev_chain, nb);
1388         if (err)
1389                 goto unlock;
1390         if (dev_boot_phase)
1391                 goto unlock;
1392         for_each_net(net) {
1393                 for_each_netdev(net, dev) {
1394                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1395                         err = notifier_to_errno(err);
1396                         if (err)
1397                                 goto rollback;
1398
1399                         if (!(dev->flags & IFF_UP))
1400                                 continue;
1401
1402                         nb->notifier_call(nb, NETDEV_UP, dev);
1403                 }
1404         }
1405
1406 unlock:
1407         rtnl_unlock();
1408         return err;
1409
1410 rollback:
1411         last = dev;
1412         for_each_net(net) {
1413                 for_each_netdev(net, dev) {
                             /*
                              * Stop at the device whose NETDEV_REGISTER failed.
                              * Leave both loops: a plain break would only end
                              * the walk of this namespace and go on to
                              * "unregister" devices in later namespaces that
                              * were never announced to @nb.
                              */
1414                         if (dev == last)
1415                                 goto outroll;
1416
1417                         if (dev->flags & IFF_UP) {
1418                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1419                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1420                         }
1421                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1422                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1423                 }
1424         }
1425
     outroll:
1426         raw_notifier_chain_unregister(&netdev_chain, nb);
1427         goto unlock;
1428 }
1429 EXPORT_SYMBOL(register_netdevice_notifier);
1430
1431 /**
1432  *      unregister_netdevice_notifier - unregister a network notifier block
1433  *      @nb: notifier
1434  *
1435  *      Unregister a notifier previously registered by
1436  *      register_netdevice_notifier(). The notifier is unlinked from the
1437  *      kernel structures and may then be reused. A negative errno code
1438  *      is returned on a failure.
      *
      *      Takes and releases the RTNL lock itself.  No events are replayed
      *      to @nb on removal.
1439  */
1440
1441 int unregister_netdevice_notifier(struct notifier_block *nb)
1442 {
1443         int err;
1444
1445         rtnl_lock();
1446         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1447         rtnl_unlock();
1448         return err;
1449 }
1450 EXPORT_SYMBOL(unregister_netdevice_notifier);
1451
1452 /**
1453  *      call_netdevice_notifiers - call all network notifier blocks
1454  *      @val: value passed unmodified to notifier function
1455  *      @dev: net_device pointer passed unmodified to notifier function
1456  *
1457  *      Call all network notifier blocks.  Parameters and return value
1458  *      are as for raw_notifier_call_chain().
1459  */
1460
1461 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1462 {
             /*
              * netdev_chain is modified under rtnl_lock() by the register and
              * unregister functions above, so it is walked under RTNL as well.
              */
1463         ASSERT_RTNL();
1464         return raw_notifier_call_chain(&netdev_chain, val, dev);
1465 }
1466
1467 /* When > 0 there are consumers of rx skb time stamps.  Incremented by
      * net_enable_timestamp() and decremented by net_disable_timestamp().
      */
1468 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1469
     /* Register one more consumer of rx skb time stamps (see netstamp_needed). */
1470 void net_enable_timestamp(void)
1471 {
1472         atomic_inc(&netstamp_needed);
1473 }
1474 EXPORT_SYMBOL(net_enable_timestamp);
1475
     /*
      * Drop one consumer of rx skb time stamps.  The counter is decremented
      * unconditionally, so calls are expected to pair with
      * net_enable_timestamp().
      */
1476 void net_disable_timestamp(void)
1477 {
1478         atomic_dec(&netstamp_needed);
1479 }
1480 EXPORT_SYMBOL(net_disable_timestamp);
1481
1482 static inline void net_timestamp_set(struct sk_buff *skb)
1483 {
1484         if (atomic_read(&netstamp_needed))
1485                 __net_timestamp(skb);
1486         else
1487                 skb->tstamp.tv64 = 0;
1488 }
1489
     /*
      * Stamp @skb only if it carries no time stamp yet (tv64 == 0) and time
      * stamps are currently wanted; an existing stamp is never overwritten.
      */
1490 static inline void net_timestamp_check(struct sk_buff *skb)
1491 {
1492         if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1493                 __net_timestamp(skb);
1494 }
1495
1496 /**
1497  * dev_forward_skb - loopback an skb to another netif
1498  *
1499  * @dev: destination network device
1500  * @skb: buffer to forward
1501  *
1502  * return values:
1503  *      NET_RX_SUCCESS  (no congestion)
1504  *      NET_RX_DROP     (packet was dropped, but freed)
1505  *
1506  * dev_forward_skb can be used for injecting an skb from the
1507  * start_xmit function of one device into the receive queue
1508  * of another device.
1509  *
1510  * The receiving device may be in another namespace, so
1511  * we have to clear all information in the skb that could
1512  * impact namespace isolation.
1513  */
1514 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1515 {
1516         skb_orphan(skb);
1517         nf_reset(skb);
1518
             /*
              * Drop (and account as rx_dropped on @dev) when the target is down
              * or the frame is longer than MTU + link header + one VLAN header.
              */
1519         if (unlikely(!(dev->flags & IFF_UP) ||
1520                      (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1521                 atomic_long_inc(&dev->rx_dropped);
1522                 kfree_skb(skb);
1523                 return NET_RX_DROP;
1524         }
1525         skb_set_dev(skb, dev);
             /* Forget the sender's time stamp and packet type. */
1526         skb->tstamp.tv64 = 0;
1527         skb->pkt_type = PACKET_HOST;
1528         skb->protocol = eth_type_trans(skb, dev);
             /* From here on the result is whatever netif_rx() reports. */
1529         return netif_rx(skb);
1530 }
1531 EXPORT_SYMBOL_GPL(dev_forward_skb);
1532
     /*
      * Hand @skb to the handler of @pt_prev.  An extra reference is taken on
      * the skb first (skb->users), so the caller's own reference stays valid
      * after the handler returns.  Returns the handler's result.
      */
1533 static inline int deliver_skb(struct sk_buff *skb,
1534                               struct packet_type *pt_prev,
1535                               struct net_device *orig_dev)
1536 {
1537         atomic_inc(&skb->users);
1538         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1539 }
1540
1541 /*
1542  *      Support routine. Sends outgoing frames to any network
1543  *      taps currently in use.
      *
      *      Only one clone of @skb is made, when the first eligible tap is
      *      found.  Each tap except the last receives it through
      *      deliver_skb(), which adds a reference; the last tap is called
      *      directly with the clone's original reference.  If the clone
      *      cannot be allocated, nothing is delivered.
1544  */
1545
1546 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1547 {
1548         struct packet_type *ptype;
1549         struct sk_buff *skb2 = NULL;
1550         struct packet_type *pt_prev = NULL;
1551
1552         rcu_read_lock();
1553         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1554                 /* Never send packets back to the socket
1555                  * they originated from - MvS (miquels@drinkel.ow.org)
1556                  */
1557                 if ((ptype->dev == dev || !ptype->dev) &&
1558                     (ptype->af_packet_priv == NULL ||
1559                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
                             /*
                              * A clone already exists: give it to the tap
                              * remembered from the previous round and
                              * remember this one instead.
                              */
1560                         if (pt_prev) {
1561                                 deliver_skb(skb2, pt_prev, skb->dev);
1562                                 pt_prev = ptype;
1563                                 continue;
1564                         }
1565
                             /* First eligible tap: create and prepare the clone. */
1566                         skb2 = skb_clone(skb, GFP_ATOMIC);
1567                         if (!skb2)
1568                                 break;
1569
1570                         net_timestamp_set(skb2);
1571
1572                         /* skb->nh should be correctly
1573                            set by sender, so that the second statement is
1574                            just protection against buggy protocols.
1575                          */
1576                         skb_reset_mac_header(skb2);
1577
1578                         if (skb_network_header(skb2) < skb2->data ||
1579                             skb2->network_header > skb2->tail) {
1580                                 if (net_ratelimit())
1581                                         printk(KERN_CRIT "protocol %04x is "
1582                                                "buggy, dev %s\n",
1583                                                ntohs(skb2->protocol),
1584                                                dev->name);
1585                                 skb_reset_network_header(skb2);
1586                         }
1587
1588                         skb2->transport_header = skb2->network_header;
1589                         skb2->pkt_type = PACKET_OUTGOING;
1590                         pt_prev = ptype;
1591                 }
1592         }
             /* The last tap takes over the clone's own reference. */
1593         if (pt_prev)
1594                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1595         rcu_read_unlock();
1596 }
1597
1598 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1599  * @dev: Network device
1600  * @txq: number of queues available
1601  *
1602  * If real_num_tx_queues is changed the tc mappings may no longer be
1603  * valid. To resolve this verify the tc mapping remains valid and if
1604  * not NULL the mapping. With no priorities mapping to this
1605  * offset/count pair it will no longer be used. In the worst case TC0
1606  * is invalid nothing can be done so disable priority mappings. It is
1607  * expected that drivers will fix this mapping if they can before
1608  * calling netif_set_real_num_tx_queues.
1609  */
1610 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1611 {
1612         int i;
1613         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1614
1615         /* If TC0 is invalidated disable TC mapping */
1616         if (tc->offset + tc->count > txq) {
1617                 pr_warning("Number of in use tx queues changed "
1618                            "invalidating tc mappings. Priority "
1619                            "traffic classification disabled!\n");
1620                 dev->num_tc = 0;
1621                 return;
1622         }
1623
1624         /* Invalidated prio to tc mappings set to TC0 */
             /* The walk starts at priority 1; priority 0 is not examined here. */
1625         for (i = 1; i < TC_BITMASK + 1; i++) {
1626                 int q = netdev_get_prio_tc_map(dev, i);
1627
1628                 tc = &dev->tc_to_txq[q];
1629                 if (tc->offset + tc->count > txq) {
1630                         pr_warning("Number of in use tx queues "
1631                                    "changed. Priority %i to tc "
1632                                    "mapping %i is no longer valid "
1633                                    "setting map to 0\n",
1634                                    i, q);
1635                         netdev_set_prio_tc_map(dev, i, 0);
1636                 }
1637         }
1638 }
1639
1640 /*
1641  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1642  * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
      *
      * Returns -EINVAL if @txq is zero or exceeds dev->num_tx_queues, the
      * error from netdev_queue_update_kobjects() if that fails, and 0
      * otherwise.  The kobject, tc and qdisc bookkeeping is done only for
      * devices that are registered or unregistering, and requires RTNL.
1643  */
1644 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1645 {
1646         int rc;
1647
1648         if (txq < 1 || txq > dev->num_tx_queues)
1649                 return -EINVAL;
1650
1651         if (dev->reg_state == NETREG_REGISTERED ||
1652             dev->reg_state == NETREG_UNREGISTERING) {
1653                 ASSERT_RTNL();
1654
1655                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1656                                                   txq);
1657                 if (rc)
1658                         return rc;
1659
1660                 if (dev->num_tc)
1661                         netif_setup_tc(dev, txq);
1662
                     /* Shrinking: reset the qdiscs of the queues being dropped. */
1663                 if (txq < dev->real_num_tx_queues)
1664                         qdisc_reset_all_tx_gt(dev, txq);
1665         }
1666
1667         dev->real_num_tx_queues = txq;
1668         return 0;
1669 }
1670 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1671
1672 #ifdef CONFIG_RPS
1673 /**
1674  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1675  *      @dev: Network device
1676  *      @rxq: Actual number of RX queues
1677  *
1678  *      Call with the rtnl_lock held, or before the net device has been
1679  *      registered.  Returns 0 on success or a negative error code;
1680  *      -EINVAL if @rxq is zero or larger than @dev->num_rx_queues.
1681  *      Before registration only that range check can fail.
1682  */
1683 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1684 {
1685         if (rxq < 1 || rxq > dev->num_rx_queues)
1686                 return -EINVAL;
1687
1688         if (dev->reg_state == NETREG_REGISTERED) {
1689                 int err;
1690
1691                 ASSERT_RTNL();
1692                 err = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1693                                                    rxq);
1694                 if (err)
1695                         return err;
1696         }
1697
1698         dev->real_num_rx_queues = rxq;
1699         return 0;
1700 }
1701 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1703 #endif
1704
     /*
      * Append @q to the tail of this CPU's softnet_data output queue and raise
      * NET_TX_SOFTIRQ.  Local interrupts are disabled around the queue update.
      * The caller decides whether @q may be queued at all; see
      * __netif_schedule().
      */
1705 static inline void __netif_reschedule(struct Qdisc *q)
1706 {
1707         struct softnet_data *sd;
1708         unsigned long flags;
1709
1710         local_irq_save(flags);
1711         sd = &__get_cpu_var(softnet_data);
             /* q becomes the new last element: link it in, then advance the tail. */
1712         q->next_sched = NULL;
1713         *sd->output_queue_tailp = q;
1714         sd->output_queue_tailp = &q->next_sched;
1715         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1716         local_irq_restore(flags);
1717 }
1718
1719 void __netif_schedule(struct Qdisc *q)
1720 {
1721         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1722                 __netif_reschedule(q);
1723 }
1724 EXPORT_SYMBOL(__netif_schedule);
1725
1726 void dev_kfree_skb_irq(struct sk_buff *skb)
1727 {
1728         if (atomic_dec_and_test(&skb->users)) {
1729                 struct softnet_data *sd;
1730                 unsigned long flags;
1731
1732                 local_irq_save(flags);
1733                 sd = &__get_cpu_var(softnet_data);
1734                 skb->next = sd->completion_queue;
1735                 sd->completion_queue = skb;
1736                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1737                 local_irq_restore(flags);
1738         }
1739 }
1740 EXPORT_SYMBOL(dev_kfree_skb_irq);
1741
/*
 * Free @skb from any context: use dev_kfree_skb_irq() when in hard-irq
 * context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1750
1751
1752 /**
1753  * netif_device_detach - mark device as removed
1754  * @dev: network device
1755  *
1756  * Mark device as removed from system and therefore no longer available.
1757  */
1758 void netif_device_detach(struct net_device *dev)
1759 {
1760         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1761             netif_running(dev)) {
1762                 netif_tx_stop_all_queues(dev);
1763         }
1764 }
1765 EXPORT_SYMBOL(netif_device_detach);
1766
1767 /**
1768  * netif_device_attach - mark device as attached
1769  * @dev: network device
1770  *
1771  * Mark device as attached from system and restart if needed.
1772  */
1773 void netif_device_attach(struct net_device *dev)
1774 {
1775         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1776             netif_running(dev)) {
1777                 netif_tx_wake_all_queues(dev);
1778                 __netdev_watchdog_up(dev);
1779         }
1780 }
1781 EXPORT_SYMBOL(netif_device_attach);
1782
/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The buffer's dst reference is always dropped.  If the buffer already
 * belongs to a device that lives in a different network namespace than
 * @dev, the namespace-private state carried by the buffer is reset
 * before the new device is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	bool crosses_netns;

	skb_dst_drop(skb);

	crosses_netns = skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev));
	if (crosses_netns) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}

	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1812
1813 /*
1814  * Invalidate hardware checksum when packet is to be mangled, and
1815  * complete checksum manually on outgoing path.
1816  */
1817 int skb_checksum_help(struct sk_buff *skb)
1818 {
1819         __wsum csum;
1820         int ret = 0, offset;
1821
1822         if (skb->ip_summed == CHECKSUM_COMPLETE)
1823                 goto out_set_summed;
1824
1825         if (unlikely(skb_shinfo(skb)->gso_size)) {
1826                 /* Let GSO fix up the checksum. */
1827                 goto out_set_summed;
1828         }
1829
1830         offset = skb_checksum_start_offset(skb);
1831         BUG_ON(offset >= skb_headlen(skb));
1832         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1833
1834         offset += skb->csum_offset;
1835         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1836
1837         if (skb_cloned(skb) &&
1838             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1839                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1840                 if (ret)
1841                         goto out;
1842         }
1843
1844         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1845 out_set_summed:
1846         skb->ip_summed = CHECKSUM_NONE;
1847 out:
1848         return ret;
1849 }
1850 EXPORT_SYMBOL(skb_checksum_help);
1851
1852 /**
1853  *      skb_gso_segment - Perform segmentation on skb.
1854  *      @skb: buffer to segment
1855  *      @features: features for the output path (see dev->features)
1856  *
1857  *      This function segments the given skb and returns a list of segments.
1858  *
1859  *      It may return NULL if the skb requires no segmentation.  This is
1860  *      only possible when GSO is used for verifying header integrity.
      *
      *      On failure an ERR_PTR() value is returned: -EINVAL when a VLAN
      *      header cannot be pulled, the pskb_expand_head() error when the
      *      header cannot be unshared, the gso_send_check() error, or
      *      -EPROTONOSUPPORT when no packet type handler matched.
      *      Otherwise the result is whatever the matching handler's
      *      gso_segment() callback returned.
1861  */
1862 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1863 {
1864         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1865         struct packet_type *ptype;
1866         __be16 type = skb->protocol;
1867         int vlan_depth = ETH_HLEN;
1868         int err;
1869
             /* Step over stacked 802.1Q headers to find the encapsulated protocol. */
1870         while (type == htons(ETH_P_8021Q)) {
1871                 struct vlan_hdr *vh;
1872
1873                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1874                         return ERR_PTR(-EINVAL);
1875
1876                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1877                 type = vh->h_vlan_encapsulated_proto;
1878                 vlan_depth += VLAN_HLEN;
1879         }
1880
             /*
              * Record the mac header at the current data pointer, then pull
              * mac_len bytes so that skb->data sits at the network header
              * for the protocol handler.
              */
1881         skb_reset_mac_header(skb);
1882         skb->mac_len = skb->network_header - skb->mac_header;
1883         __skb_pull(skb, skb->mac_len);
1884
             /*
              * A packet that is not CHECKSUM_PARTIAL is not expected here:
              * warn with the driver name and feature masks, and make sure
              * the header is private before it gets modified.
              */
1885         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1886                 struct net_device *dev = skb->dev;
1887                 struct ethtool_drvinfo info = {};
1888
1889                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1890                         dev->ethtool_ops->get_drvinfo(dev, &info);
1891
1892                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1893                      info.driver, dev ? dev->features : 0L,
1894                      skb->sk ? skb->sk->sk_route_caps : 0L,
1895                      skb->len, skb->data_len, skb->ip_summed);
1896
                     /* NOTE: this early return leaves skb->data pulled past the mac header. */
1897                 if (skb_header_cloned(skb) &&
1898                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1899                         return ERR_PTR(err);
1900         }
1901
             /*
              * Find the first device-independent handler for this protocol
              * that provides a gso_segment callback; only that one is used.
              */
1902         rcu_read_lock();
1903         list_for_each_entry_rcu(ptype,
1904                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1905                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1906                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1907                                 err = ptype->gso_send_check(skb);
1908                                 segs = ERR_PTR(err);
                                     /*
                                      * Stop on error, or when the output
                                      * path can take the skb as is; in the
                                      * latter case err is 0 and segs is
                                      * the NULL result described above.
                                      */
1909                                 if (err || skb_gso_ok(skb, features))
1910                                         break;
                                     /*
                                      * Move skb->data back to the network
                                      * header (presumably gso_send_check()
                                      * may have advanced it - confirm).
                                      */
1911                                 __skb_push(skb, (skb->data -
1912                                                  skb_network_header(skb)));
1913                         }
1914                         segs = ptype->gso_segment(skb, features);
1915                         break;
1916                 }
1917         }
1918         rcu_read_unlock();
1919
             /* Restore skb->data to the mac header recorded above. */
1920         __skb_push(skb, skb->data - skb_mac_header(skb));
1921
1922         return segs;
1923 }
1924 EXPORT_SYMBOL(skb_gso_segment);
1925
/*
 * Report a hardware receive checksum error: log the device name (or
 * "<unknown>" when @dev is NULL) and dump the stack, rate limited.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name;

	if (!net_ratelimit())
		return;

	name = dev ? dev->name : "<unknown>";
	printk(KERN_ERR "%s: hw csum failure.\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1938
/*
 * Returns 1 when at least one page fragment of @skb cannot be handed to
 * @dev for DMA, 0 otherwise (always 0 without CONFIG_HIGHMEM).
 *
 * This check could go away once we know that:
 * 1. an IOMMU is present and is able to map all of memory, or
 * 2. the machine has no high memory at all.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	int idx;

	/* highmem fragments need a device advertising NETIF_F_HIGHDMA */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
			if (PageHighMem(skb_shinfo(skb)->frags[idx].page))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	/* every fragment page must end within the parent's DMA mask */
	for (idx = 0; idx < skb_shinfo(skb)->nr_frags; idx++) {
		dma_addr_t phys = page_to_phys(skb_shinfo(skb)->frags[idx].page);

		if (!parent->dma_mask ||
		    phys + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
1968
1969 struct dev_gso_cb {
             /*
              * Destructor the skb had before dev_gso_segment() installed
              * dev_gso_skb_destructor(); the latter calls it (if non-NULL)
              * after freeing the segments still chained on skb->next.
              */
1970         void (*destructor)(struct sk_buff *skb);
1971 };
1972
     /* View of the skb's cb[] area as a struct dev_gso_cb. */
1973 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1974
1975 static void dev_gso_skb_destructor(struct sk_buff *skb)
1976 {
1977         struct dev_gso_cb *cb;
1978
1979         do {
1980                 struct sk_buff *nskb = skb->next;
1981
1982                 skb->next = nskb->next;
1983                 nskb->next = NULL;
1984                 kfree_skb(nskb);
1985         } while (skb->next);
1986
1987         cb = DEV_GSO_CB(skb);
1988         if (cb->destructor)
1989                 cb->destructor(skb);
1990 }
1991
1992 /**
1993  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1994  *      @skb: buffer to segment
1995  *      @features: device features as applicable to this skb
1996  *
1997  *      This function segments the given skb and stores the list of segments
1998  *      in skb->next.
1999  */
2000 static int dev_gso_segment(struct sk_buff *skb, int features)
2001 {
2002         struct sk_buff *segs;
2003
2004         segs = skb_gso_segment(skb, features);
2005
2006         /* Verifying header integrity only. */
2007         if (!segs)
2008                 return 0;
2009
2010         if (IS_ERR(segs))
2011                 return PTR_ERR(segs);
2012
2013         skb->next = segs;
2014         DEV_GSO_CB(skb)->destructor = skb->destructor;
2015         skb->destructor = dev_gso_skb_destructor;
2016
2017         return 0;
2018 }
2019
2020 /*
2021  * Try to orphan skb early, right before transmission by the device.
2022  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2023  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2024  */
2025 static inline void skb_orphan_try(struct sk_buff *skb)
2026 {
2027         struct sock *sk = skb->sk;
2028
2029         if (sk && !skb_shinfo(skb)->tx_flags) {
2030                 /* skb_tx_hash() wont be able to get sk.
2031                  * We copy sk_hash into skb->rxhash
2032                  */
2033                 if (!skb->rxhash)
2034                         skb->rxhash = sk->sk_hash;
2035                 skb_orphan(skb);
2036         }
2037 }
2038
2039 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2040 {
2041         return ((features & NETIF_F_GEN_CSUM) ||
2042                 ((features & NETIF_F_V4_CSUM) &&
2043                  protocol == htons(ETH_P_IP)) ||
2044                 ((features & NETIF_F_V6_CSUM) &&
2045                  protocol == htons(ETH_P_IPV6)) ||
2046                 ((features & NETIF_F_FCOE_CRC) &&
2047                  protocol == htons(ETH_P_FCOE)));
2048 }
2049
2050 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2051 {
2052         if (!can_checksum_protocol(features, protocol)) {
2053                 features &= ~NETIF_F_ALL_CSUM;
2054                 features &= ~NETIF_F_SG;
2055         } else if (illegal_highdma(skb->dev, skb)) {
2056                 features &= ~NETIF_F_SG;
2057         }
2058
2059         return features;
2060 }
2061
2062 u32 netif_skb_features(struct sk_buff *skb)
2063 {
2064         __be16 protocol = skb->protocol;
2065         u32 features = skb->dev->features;
2066
2067         if (protocol == htons(ETH_P_8021Q)) {
2068                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2069                 protocol = veh->h_vlan_encapsulated_proto;
2070         } else if (!vlan_tx_tag_present(skb)) {
2071                 return harmonize_features(skb, protocol, features);
2072         }
2073
2074         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2075
2076         if (protocol != htons(ETH_P_8021Q)) {
2077                 return harmonize_features(skb, protocol, features);
2078         } else {
2079                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2080                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2081                 return harmonize_features(skb, protocol, features);
2082         }
2083 }
2084 EXPORT_SYMBOL(netif_skb_features);
2085
2086 /*
2087  * Returns true if either:
2088  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2089  *      2. skb is fragmented and the device does not support SG, or if
2090  *         at least one of fragments is in highmem and device does not
2091  *         support DMA from it.
2092  */
2093 static inline int skb_needs_linearize(struct sk_buff *skb,
2094                                       int features)
2095 {
2096         return skb_is_nonlinear(skb) &&
2097                         ((skb_has_frag_list(skb) &&
2098                                 !(features & NETIF_F_FRAGLIST)) ||
2099                         (skb_shinfo(skb)->nr_frags &&
2100                                 !(features & NETIF_F_SG)));
2101 }
2102
2103 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2104                         struct netdev_queue *txq)
2105 {
             /*
              * Hand @skb to the driver's ndo_start_xmit and return its
              * result.  An skb that arrives with skb->next set is treated as
              * a GSO segment list that was only partly sent, and resumes in
              * the "gso" loop below without repeating the preparation steps.
              */
2106         const struct net_device_ops *ops = dev->netdev_ops;
2107         int rc = NETDEV_TX_OK;
2108
2109         if (likely(!skb->next)) {
2110                 u32 features;
2111
2112                 /*
2113                  * If device doesnt need skb->dst, release it right now while
2114                  * its hot in this cpu cache
2115                  */
2116                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2117                         skb_dst_drop(skb);
2118
                     /* Deliver a copy to ptype_all listeners, if there are any. */
2119                 if (!list_empty(&ptype_all))
2120                         dev_queue_xmit_nit(skb, dev);
2121
2122                 skb_orphan_try(skb);
2123
2124                 features = netif_skb_features(skb);
2125
                     /*
                      * No hardware VLAN insertion for this skb: put the tag
                      * into the packet data instead.  On failure skb is NULL
                      * and the function returns rc, still NETDEV_TX_OK
                      * (presumably __vlan_put_tag() frees the skb - confirm).
                      */
2126                 if (vlan_tx_tag_present(skb) &&
2127                     !(features & NETIF_F_HW_VLAN_TX)) {
2128                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2129                         if (unlikely(!skb))
2130                                 goto out;
2131
2132                         skb->vlan_tci = 0;
2133                 }
2134
                     /*
                      * Every "goto out_kfree_skb" below frees the skb and
                      * returns rc, which is still NETDEV_TX_OK at that point.
                      */
2135                 if (netif_needs_gso(skb, features)) {
2136                         if (unlikely(dev_gso_segment(skb, features)))
2137                                 goto out_kfree_skb;
                             /* skb->next stays NULL when nothing was segmented. */
2138                         if (skb->next)
2139                                 goto gso;
2140                 } else {
2141                         if (skb_needs_linearize(skb, features) &&
2142                             __skb_linearize(skb))
2143                                 goto out_kfree_skb;
2144
2145                         /* If packet is not checksummed and device does not
2146                          * support checksumming for this protocol, complete
2147                          * checksumming here.
2148                          */
2149                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2150                                 skb_set_transport_header(skb,
2151                                         skb_checksum_start_offset(skb));
2152                                 if (!(features & NETIF_F_ALL_CSUM) &&
2153                                      skb_checksum_help(skb))
2154                                         goto out_kfree_skb;
2155                         }
2156                 }
2157
2158                 rc = ops->ndo_start_xmit(skb, dev);
2159                 trace_net_dev_xmit(skb, rc);
2160                 if (rc == NETDEV_TX_OK)
2161                         txq_trans_update(txq);
2162                 return rc;
2163         }
2164
2165 gso:
             /* Send the segments chained on skb->next one at a time. */
2166         do {
2167                 struct sk_buff *nskb = skb->next;
2168
                     /* Detach the head segment from the list. */
2169                 skb->next = nskb->next;
2170                 nskb->next = NULL;
2171
2172                 /*
2173                  * If device doesnt need nskb->dst, release it right now while
2174                  * its hot in this cpu cache
2175                  */
2176                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2177                         skb_dst_drop(nskb);
2178
2179                 rc = ops->ndo_start_xmit(nskb, dev);
2180                 trace_net_dev_xmit(nskb, rc);
2181                 if (unlikely(rc != NETDEV_TX_OK)) {
                             /*
                              * A code with bits outside NETDEV_TX_MASK ends
                              * the transmission: free what is left.  Any
                              * other code puts the segment back at the head
                              * of the list and hands rc to the caller.
                              */
2182                         if (rc & ~NETDEV_TX_MASK)
2183                                 goto out_kfree_gso_skb;
2184                         nskb->next = skb->next;
2185                         skb->next = nskb;
2186                         return rc;
2187                 }
2188                 txq_trans_update(txq);
                     /* Queue stopped with segments left: report busy, list intact. */
2189                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2190                         return NETDEV_TX_BUSY;
2191         } while (skb->next);
2192
2193 out_kfree_gso_skb:
             /*
              * With no segments left, restore the saved destructor so that
              * dev_gso_skb_destructor(), which dereferences skb->next
              * unconditionally, is not left installed.  Otherwise it stays
              * in place (presumably run by kfree_skb() to free the
              * remaining segments - confirm).
              */
2194         if (likely(skb->next == NULL))
2195                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2196 out_kfree_skb:
2197         kfree_skb(skb);
2198 out:
2199         return rc;
2200 }
2201
2202 static u32 hashrnd __read_mostly; /* seed passed to the jhash_*() calls of the tx/rx hash helpers below; not assigned in this part of the file */
2203
2204 /*
2205  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2206  * to be used as a distribution range.
2207  */
2208 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2209                   unsigned int num_tx_queues)
2210 {
2211         u32 hash;
2212         u16 qoffset = 0;
2213         u16 qcount = num_tx_queues;
2214
2215         if (skb_rx_queue_recorded(skb)) {
2216                 hash = skb_get_rx_queue(skb);
2217                 while (unlikely(hash >= num_tx_queues))
2218                         hash -= num_tx_queues;
2219                 return hash;
2220         }
2221
2222         if (dev->num_tc) {
2223                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2224                 qoffset = dev->tc_to_txq[tc].offset;
2225                 qcount = dev->tc_to_txq[tc].count;
2226         }
2227
2228         if (skb->sk && skb->sk->sk_hash)
2229                 hash = skb->sk->sk_hash;
2230         else
2231                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2232         hash = jhash_1word(hash, hashrnd);
2233
2234         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2235 }
2236 EXPORT_SYMBOL(__skb_tx_hash);
2237
2238 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2239 {
2240         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2241                 if (net_ratelimit()) {
2242                         pr_warning("%s selects TX queue %d, but "
2243                                 "real number of TX queues is %d\n",
2244                                 dev->name, queue_index, dev->real_num_tx_queues);
2245                 }
2246                 return 0;
2247         }
2248         return queue_index;
2249 }
2250
/*
 * Look up a Tx queue for @skb in the device's XPS map of the current
 * CPU.  Returns the queue index, or -1 when XPS is not configured, no
 * map exists for this CPU, or the mapped queue is not below
 * real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	int queue_index = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
	if (!cpu_map)
		goto unlock;

	if (cpu_map->len != 1) {
		u32 hash;

		/* several candidates: spread flows over them by hash */
		if (!skb->sk || !skb->sk->sk_hash)
			hash = (__force u16) skb->protocol ^
			    skb->rxhash;
		else
			hash = skb->sk->sk_hash;
		hash = jhash_1word(hash, hashrnd);
		queue_index = cpu_map->queues[
		    ((u64)hash * cpu_map->len) >> 32];
	} else {
		queue_index = cpu_map->queues[0];
	}

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
2288
2289 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2290                                         struct sk_buff *skb)
2291 {
2292         int queue_index;
2293         const struct net_device_ops *ops = dev->netdev_ops;
2294
2295         if (dev->real_num_tx_queues == 1)
2296                 queue_index = 0;
2297         else if (ops->ndo_select_queue) {
2298                 queue_index = ops->ndo_select_queue(dev, skb);
2299                 queue_index = dev_cap_txqueue(dev, queue_index);
2300         } else {
2301                 struct sock *sk = skb->sk;
2302                 queue_index = sk_tx_queue_get(sk);
2303
2304                 if (queue_index < 0 || skb->ooo_okay ||
2305                     queue_index >= dev->real_num_tx_queues) {
2306                         int old_index = queue_index;
2307
2308                         queue_index = get_xps_queue(dev, skb);
2309                         if (queue_index < 0)
2310                                 queue_index = skb_tx_hash(dev, skb);
2311
2312                         if (queue_index != old_index && sk) {
2313                                 struct dst_entry *dst =
2314                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2315
2316                                 if (dst && skb_dst(skb) == dst)
2317                                         sk_tx_queue_set(sk, queue_index);
2318                         }
2319                 }
2320         }
2321
2322         skb_set_queue_mapping(skb, queue_index);
2323         return netdev_get_tx_queue(dev, queue_index);
2324 }
2325
2326 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2327                                  struct net_device *dev,
2328                                  struct netdev_queue *txq)
2329 {
             /*
              * Pass @skb to qdisc @q under the qdisc's root lock: drop it if
              * the qdisc is deactivated, transmit it directly if the qdisc
              * can be bypassed, or enqueue it and run the queue.  Returns
              * NET_XMIT_DROP, NET_XMIT_SUCCESS or the masked result of
              * q->enqueue().
              */
2330         spinlock_t *root_lock = qdisc_lock(q);
2331         bool contended;
2332         int rc;
2333
2334         qdisc_skb_cb(skb)->pkt_len = skb->len;
2335         qdisc_calculate_pkt_len(skb, q);
2336         /*
2337          * Heuristic to force contended enqueues to serialize on a
2338          * separate lock before trying to get qdisc main lock.
2339          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2340          * and dequeue packets faster.
2341          */
2342         contended = qdisc_is_running(q);
2343         if (unlikely(contended))
2344                 spin_lock(&q->busylock);
2345
2346         spin_lock(root_lock);
2347         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2348                 kfree_skb(skb);
2349                 rc = NET_XMIT_DROP;
2350         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2351                    qdisc_run_begin(q)) {
2352                 /*
2353                  * This is a work-conserving queue; there are no old skbs
2354                  * waiting to be sent out; and the qdisc is not running -
2355                  * xmit the skb directly.
2356                  */
2357                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2358                         skb_dst_force(skb);
2359
2360                 qdisc_bstats_update(q, skb);
2361
                     /*
                      * A non-zero result from sch_direct_xmit() leads to a
                      * full queue run; busylock, if held, is released first.
                      * Otherwise only the running state taken by
                      * qdisc_run_begin() is ended.
                      */
2362                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2363                         if (unlikely(contended)) {
2364                                 spin_unlock(&q->busylock);
2365                                 contended = false;
2366                         }
2367                         __qdisc_run(q);
2368                 } else
2369                         qdisc_run_end(q);
2370
2371                 rc = NET_XMIT_SUCCESS;
2372         } else {
                     /*
                      * Regular path: enqueue, then run the queue ourselves
                      * only if qdisc_run_begin() grants it.
                      */
2373                 skb_dst_force(skb);
2374                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2375                 if (qdisc_run_begin(q)) {
2376                         if (unlikely(contended)) {
2377                                 spin_unlock(&q->busylock);
2378                                 contended = false;
2379                         }
2380                         __qdisc_run(q);
2381                 }
2382         }
2383         spin_unlock(root_lock);
             /* busylock is still held here if no branch above released it. */
2384         if (unlikely(contended))
2385                 spin_unlock(&q->busylock);
2386         return rc;
2387 }
2388
2389 static DEFINE_PER_CPU(int, xmit_recursion); /* per-CPU depth of dev_hard_start_xmit() calls made from dev_queue_xmit()'s queueless path */
2390 #define RECURSION_LIMIT 10 /* dev_queue_xmit() refuses to transmit once the depth exceeds this */
2391
2392 /**
2393  *      dev_queue_xmit - transmit a buffer
2394  *      @skb: buffer to transmit
2395  *
2396  *      Queue a buffer for transmission to a network device. The caller must
2397  *      have set the device and priority and built the buffer before calling
2398  *      this function. The function can be called from an interrupt.
2399  *
2400  *      A negative errno code is returned on a failure. A success does not
2401  *      guarantee the frame will be transmitted as it may be dropped due
2402  *      to congestion or traffic shaping.
2403  *
2404  * -----------------------------------------------------------------------------------
2405  *      I notice this method can also return errors from the queue disciplines,
2406  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2407  *      be positive.
2408  *
2409  *      Regardless of the return value, the skb is consumed, so it is currently
2410  *      difficult to retry a send to this method.  (You can bump the ref count
2411  *      before sending to hold a reference for retry if you are careful.)
2412  *
2413  *      When calling this method, interrupts MUST be enabled.  This is because
2414  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2415  *          --BLG
2416  */
2417 int dev_queue_xmit(struct sk_buff *skb)
2418 {
2419         struct net_device *dev = skb->dev;
2420         struct netdev_queue *txq;
2421         struct Qdisc *q;
             /*
              * NOTE(review): every exit path below assigns rc before
              * returning, so this initial value appears never to be returned.
              */
2422         int rc = -ENOMEM;
2423
2424         /* Disable soft irqs for various locks below. Also
2425          * stops preemption for RCU.
2426          */
2427         rcu_read_lock_bh();
2428
2429         txq = dev_pick_tx(dev, skb);
2430         q = rcu_dereference_bh(txq->qdisc);
2431
2432 #ifdef CONFIG_NET_CLS_ACT
2433         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2434 #endif
2435         trace_net_dev_queue(skb);
             /* A qdisc with an enqueue operation takes the packet from here. */
2436         if (q->enqueue) {
2437                 rc = __dev_xmit_skb(skb, q, dev, txq);
2438                 goto out;
2439         }
2440
2441         /* The device has no queue. Common case for software devices:
2442            loopback, all the sorts of tunnels...
2443
2444            Really, it is unlikely that netif_tx_lock protection is necessary
2445            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2446            counters.)
2447            However, it is possible, that they rely on protection
2448            made by us here.
2449
2450            Check this and shot the lock. It is not prone from deadlocks.
2451            Either shot noqueue qdisc, it is even simpler 8)
2452          */
2453         if (dev->flags & IFF_UP) {
2454                 int cpu = smp_processor_id(); /* ok because BHs are off */
2455
                     /*
                      * If this CPU already owns the queue's xmit lock we got
                      * here recursively through the same queue; that case is
                      * reported in the else branch instead of taking the
                      * lock again.
                      */
2456                 if (txq->xmit_lock_owner != cpu) {
2457
2458                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2459                                 goto recursion_alert;
2460
2461                         HARD_TX_LOCK(dev, txq, cpu);
2462
2463                         if (!netif_tx_queue_stopped(txq)) {
2464                                 __this_cpu_inc(xmit_recursion);
2465                                 rc = dev_hard_start_xmit(skb, dev, txq);
2466                                 __this_cpu_dec(xmit_recursion);
2467                                 if (dev_xmit_complete(rc)) {
2468                                         HARD_TX_UNLOCK(dev, txq);
2469                                         goto out;
2470                                 }
2471                         }
                             /*
                              * Queue stopped, or the driver did not complete
                              * the transmit: there is no queue to hold the
                              * packet, so it is dropped below.
                              */
2472                         HARD_TX_UNLOCK(dev, txq);
2473                         if (net_ratelimit())
2474                                 printk(KERN_CRIT "Virtual device %s asks to "
2475                                        "queue packet!\n", dev->name);
2476                 } else {
2477                         /* Recursion is detected! It is possible,
2478                          * unfortunately
2479                          */
2480 recursion_alert:
2481                         if (net_ratelimit())
2482                                 printk(KERN_CRIT "Dead loop on virtual device "
2483                                        "%s, fix it urgently!\n", dev->name);
2484                 }
2485         }
2486
             /*
              * Common failure exit for the queueless path (device down,
              * recursion, or transmit not completed): free the skb and
              * report -ENETDOWN.
              */
2487         rc = -ENETDOWN;
2488         rcu_read_unlock_bh();
2489
2490         kfree_skb(skb);
2491         return rc;
2492 out:
2493         rcu_read_unlock_bh();
2494         return rc;
2495 }
2496 EXPORT_SYMBOL(dev_queue_xmit);
2497
2498
2499 /*=======================================================================
2500                         Receiver routines
2501   =======================================================================*/
2502
2503 int netdev_max_backlog __read_mostly = 1000; /* NOTE(review): presumably the input backlog length limit; its users are outside this section - confirm */
2504 int netdev_tstamp_prequeue __read_mostly = 1; /* NOTE(review): presumably selects timestamping before queueing; its users are outside this section - confirm */
2505 int netdev_budget __read_mostly = 300; /* NOTE(review): presumably the per-run RX softirq packet budget; its users are outside this section - confirm */
2506 int weight_p __read_mostly = 64;            /* old backlog weight */
2507
2508 /* Called with irq disabled */
2509 static inline void ____napi_schedule(struct softnet_data *sd,
2510                                      struct napi_struct *napi)
2511 {
2512         list_add_tail(&napi->poll_list, &sd->poll_list);
2513         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2514 }
2515
2516 /*
2517  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2518  * and src/dst port numbers. Returns a non-zero hash number on success
2519  * and 0 on failure.
      *
      * Only ETH_P_IP and ETH_P_IPV6 (selected by skb->protocol) are hashed.
      * Failure means another protocol, or a network header that
      * pskb_may_pull() could not make available.  The two addresses and the
      * two 16-bit port values are each put in ascending order before hashing,
      * so both directions of a flow produce the same value.
2520  */
2521 __u32 __skb_get_rxhash(struct sk_buff *skb)
2522 {
2523         int nhoff, hash = 0, poff;
2524         struct ipv6hdr *ip6;
2525         struct iphdr *ip;
2526         u8 ip_proto;
2527         u32 addr1, addr2, ihl;
2528         union {
2529                 u32 v32;
2530                 u16 v16[2];
2531         } ports;
2532
2533         nhoff = skb_network_offset(skb);
2534
             /* Extract protocol, addresses and header length (in 32-bit words). */
2535         switch (skb->protocol) {
2536         case __constant_htons(ETH_P_IP):
2537                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2538                         goto done;
2539
2540                 ip = (struct iphdr *) (skb->data + nhoff);
                     /* Fragments (MF set or non-zero offset) are hashed as protocol 0. */
2541                 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2542                         ip_proto = 0;
2543                 else
2544                         ip_proto = ip->protocol;
2545                 addr1 = (__force u32) ip->saddr;
2546                 addr2 = (__force u32) ip->daddr;
2547                 ihl = ip->ihl;
2548                 break;
2549         case __constant_htons(ETH_P_IPV6):
2550                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2551                         goto done;
2552
2553                 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
                     /* nexthdr is used as-is; extension headers are not walked here. */
2554                 ip_proto = ip6->nexthdr;
                     /* Only the last 32-bit word of each address feeds the hash. */
2555                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2556                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
                     /* Fixed 40-byte header, expressed in 32-bit words like ip->ihl. */
2557                 ihl = (40 >> 2);
2558                 break;
2559         default:
2560                 goto done;
2561         }
2562
             /*
              * Ports stay 0 when proto_ports_offset() reports no offset (< 0)
              * or when the 4 bytes holding them cannot be pulled.  The read
              * goes through skb->data again; ip/ip6 are not used past here.
              */
2563         ports.v32 = 0;
2564         poff = proto_ports_offset(ip_proto);
2565         if (poff >= 0) {
2566                 nhoff += ihl * 4 + poff;
2567                 if (pskb_may_pull(skb, nhoff + 4)) {
2568                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2569                         if (ports.v16[1] < ports.v16[0])
2570                                 swap(ports.v16[0], ports.v16[1]);
2571                 }
2572         }
2573
2574         /* get a consistent hash (same value on both flow directions) */
2575         if (addr2 < addr1)
2576                 swap(addr1, addr2);
2577
2578         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
             /* 0 is reserved for "failure", so a genuine 0 result becomes 1. */
2579         if (!hash)
2580                 hash = 1;
2581
2582 done:
2583         return hash;
2584 }
2585 EXPORT_SYMBOL(__skb_get_rxhash);
2586
2587 #ifdef CONFIG_RPS
2588
2589 /*
      * One global table that all flow-based protocols share.
      * get_rps_cpu() reads it with rcu_dereference().
      */
2590 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2591 EXPORT_SYMBOL(rps_sock_flow_table);
2592
     /*
      * set_rps_cpu - point a flow table entry at a new CPU.
      * @dev:      device the skb arrived on
      * @skb:      packet belonging to the flow
      * @rflow:    flow table entry currently used for the flow
      * @next_cpu: CPU to record in the entry (may be RPS_NO_CPU)
      *
      * Stores @next_cpu in @rflow and, when it is a real CPU, records that
      * CPU's current input_queue_head in last_qtail.  With CONFIG_RFS_ACCEL
      * it may first ask the driver, through ndo_rx_flow_steer(), to steer the
      * flow to the RX queue that cpu_rmap_lookup_index() returns for
      * @next_cpu; if the driver accepts, the entry in that queue's flow table
      * takes over.
      *
      * Uses rcu_dereference(); its caller get_rps_cpu() documents that
      * rcu_read_lock must be held.
      *
      * Returns the entry that now describes the flow: @rflow, or the entry in
      * the newly selected queue's table.
      */
2593 static struct rps_dev_flow *
2594 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2595             struct rps_dev_flow *rflow, u16 next_cpu)
2596 {
2597         u16 tcpu;
2598
2599         tcpu = rflow->cpu = next_cpu;
2600         if (tcpu != RPS_NO_CPU) {
2601 #ifdef CONFIG_RFS_ACCEL
2602                 struct netdev_rx_queue *rxqueue;
2603                 struct rps_dev_flow_table *flow_table;
2604                 struct rps_dev_flow *old_rflow;
2605                 u32 flow_id;
2606                 u16 rxq_index;
2607                 int rc;
2608
2609                 /* Should we steer this flow to a different hardware queue? */
2610                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap)
2611                         goto out;
2612                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                     /* Already arriving on the queue mapped to next_cpu. */
2613                 if (rxq_index == skb_get_rx_queue(skb))
2614                         goto out;
2615
2616                 rxqueue = dev->_rx + rxq_index;
2617                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2618                 if (!flow_table)
2619                         goto out;
2620                 flow_id = skb->rxhash & flow_table->mask;
                     /* A negative return means no filter was installed. */
2621                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2622                                                         rxq_index, flow_id);
2623                 if (rc < 0)
2624                         goto out;
                     /* Hand the flow over to the entry in the new queue's table. */
2625                 old_rflow = rflow;
2626                 rflow = &flow_table->flows[flow_id];
2627                 rflow->cpu = next_cpu;
2628                 rflow->filter = rc;
                     /* The old entry must not keep claiming the same filter id. */
2629                 if (old_rflow->filter == rflow->filter)
2630                         old_rflow->filter = RPS_NO_FILTER;
2631         out:
2632 #endif
2633                 rflow->last_qtail =
2634                         per_cpu(softnet_data, tcpu).input_queue_head;
2635         }
2636
2637         return rflow;
2638 }
2639
2640 /*
2641  * get_rps_cpu is called from netif_rx and netif_receive_skb and returns
2642  * the target CPU from the RPS map of the receiving queue for a given skb.
2643  * rcu_read_lock must be held on entry.
      *
      * @dev:    device the skb was received on
      * @skb:    packet to steer; its network header is reset and its rxhash
      *          obtained through skb_get_rxhash() when steering is configured
      * @rflowp: set to the flow table entry used, but only when the CPU was
      *          chosen from the flow tables (RFS); otherwise left untouched
      *
      * Returns the chosen online CPU, or -1 when the packet should not be
      * steered (no map or flow table, bad queue index, no hash, or the
      * candidate CPU is offline).
2644  */
2645 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2646                        struct rps_dev_flow **rflowp)
2647 {
2648         struct netdev_rx_queue *rxqueue;
2649         struct rps_map *map;
2650         struct rps_dev_flow_table *flow_table;
2651         struct rps_sock_flow_table *sock_flow_table;
2652         int cpu = -1;
2653         u16 tcpu;
2654
             /* Pick the RX queue the packet arrived on (queue 0 if unknown). */
2655         if (skb_rx_queue_recorded(skb)) {
2656                 u16 index = skb_get_rx_queue(skb);
2657                 if (unlikely(index >= dev->real_num_rx_queues)) {
2658                         WARN_ONCE(dev->real_num_rx_queues > 1,
2659                                   "%s received packet on queue %u, but number "
2660                                   "of RX queues is %u\n",
2661                                   dev->name, index, dev->real_num_rx_queues);
2662                         goto done;
2663                 }
2664                 rxqueue = dev->_rx + index;
2665         } else
2666                 rxqueue = dev->_rx;
2667
             /*
              * Shortcuts that avoid computing the hash: a single-entry map
              * with no flow table, or neither map nor flow table at all.
              */
2668         map = rcu_dereference(rxqueue->rps_map);
2669         if (map) {
2670                 if (map->len == 1 &&
2671                     !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2672                         tcpu = map->cpus[0];
2673                         if (cpu_online(tcpu))
2674                                 cpu = tcpu;
2675                         goto done;
2676                 }
2677         } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2678                 goto done;
2679         }
2680
2681         skb_reset_network_header(skb);
2682         if (!skb_get_rxhash(skb))
2683                 goto done;
2684
2685         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2686         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2687         if (flow_table && sock_flow_table) {
2688                 u16 next_cpu;
2689                 struct rps_dev_flow *rflow;
2690
2691                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2692                 tcpu = rflow->cpu;
2693
2694                 next_cpu = sock_flow_table->ents[skb->rxhash &
2695                     sock_flow_table->mask];
2696
2697                 /*
2698                  * If the desired CPU (where last recvmsg was done) is
2699                  * different from current CPU (one in the rx-queue flow
2700                  * table entry), switch if one of the following holds:
2701                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2702                  *   - Current CPU is offline.
2703                  *   - The current CPU's queue tail has advanced beyond the
2704                  *     last packet that was enqueued using this table entry.
2705                  *     This guarantees that all previous packets for the flow
2706                  *     have been dequeued, thus preserving in order delivery.
2707                  */
2708                 if (unlikely(tcpu != next_cpu) &&
2709                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2710                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2711                       rflow->last_qtail)) >= 0)) {
                             /*
                              * The flow moves: the packet must follow it to
                              * next_cpu.  Without this the packet would still
                              * be queued on the old CPU while the entry's
                              * last_qtail was sampled from the new one,
                              * defeating the in-order guarantee above.
                              */
                             tcpu = next_cpu;
2712                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
                     }
2713
2714                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2715                         *rflowp = rflow;
2716                         cpu = tcpu;
2717                         goto done;
2718                 }
2719         }
2720
             /* Fall back to the map: scale the 32-bit hash onto [0, map->len). */
2721         if (map) {
2722                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2723
2724                 if (cpu_online(tcpu)) {
2725                         cpu = tcpu;
2726                         goto done;
2727                 }
2728         }
2729
2730 done:
2731         return cpu;
2732 }
2733
2734 #ifdef CONFIG_RFS_ACCEL
2735
2736 /**
2737  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2738  * @dev: Device on which the filter was set
2739  * @rxq_index: RX queue index
2740  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2741  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2742  *
2743  * Drivers that implement ndo_rx_flow_steer() should periodically call
2744  * this function for each installed filter and remove the filters for
2745  * which it returns %true.
      *
      * Returns %false only while the flow table entry for @flow_id still
      * carries @filter_id, has a CPU assigned, and that CPU's
      * input_queue_head is less than 10 * mask ahead of the entry's
      * last_qtail.  In every other case (including no flow table or an
      * out-of-range @flow_id) it returns %true.
2746  */
2747 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2748                          u32 flow_id, u16 filter_id)
2749 {
2750         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2751         struct rps_dev_flow_table *flow_table;
2752         struct rps_dev_flow *rflow;
2753         bool expire = true;
2754         int cpu;
2755
2756         rcu_read_lock();
2757         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2758         if (flow_table && flow_id <= flow_table->mask) {
2759                 rflow = &flow_table->flows[flow_id];
                     /* Read the cpu once; the same value feeds both tests below. */
2760                 cpu = ACCESS_ONCE(rflow->cpu);
2761                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2762                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2763                            rflow->last_qtail) <
2764                      (int)(10 * flow_table->mask)))
2765                         expire = false;
2766         }
2767         rcu_read_unlock();
2768         return expire;
2769 }
2770 EXPORT_SYMBOL(rps_may_expire_flow);
2771
2772 #endif /* CONFIG_RFS_ACCEL */
2773
2774 /*
      * Schedule the backlog NAPI of the softnet_data passed in @data and
      * count the event in its received_rps counter.
      * Called from hardirq (IPI) context.
      */
2775 static void rps_trigger_softirq(void *data)
2776 {
2777         struct softnet_data *sd = data;
2778
2779         ____napi_schedule(sd, &sd->backlog);
2780         sd->received_rps++;
2781 }
2782
2783 #endif /* CONFIG_RPS */
2784
2785 /*
2786  * If @sd belongs to another cpu, push it onto the head of this cpu's
2787  * IPI list, raise NET_RX_SOFTIRQ and return 1.
2788  * If @sd is this cpu's own softnet_data, or RPS is compiled out, return 0.
2789  */
2790 static int rps_ipi_queued(struct softnet_data *sd)
2791 {
2792 #ifdef CONFIG_RPS
2793         struct softnet_data *this_sd = &__get_cpu_var(softnet_data);
2794
2795         if (sd == this_sd)
2796                 return 0;
2797
2798         sd->rps_ipi_next = this_sd->rps_ipi_list;
2799         this_sd->rps_ipi_list = sd;
2800         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2801         return 1;
2802 #else
2803         return 0;
2804 #endif /* CONFIG_RPS */
     }
2805
2806 /*
2807  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2808  * queue (may be a remote CPU queue).
      *
      * @skb:   packet to queue
      * @cpu:   cpu whose softnet_data receives the packet
      * @qtail: handed to input_queue_tail_incr_save() when the packet is queued
      *
      * Returns NET_RX_SUCCESS when queued.  If input_pkt_queue is already
      * longer than netdev_max_backlog, the skb is freed, sd->dropped and the
      * device's rx_dropped are incremented, and NET_RX_DROP is returned.
2809  */
2810 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2811                               unsigned int *qtail)
2812 {
2813         struct softnet_data *sd;
2814         unsigned long flags;
2815
2816         sd = &per_cpu(softnet_data, cpu);
2817
2818         local_irq_save(flags);
2819
2820         rps_lock(sd);
2821         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
                     /*
                      * Non-empty queue: just append.  The enqueue label is
                      * also the target of the goto on the empty-queue path.
                      */
2822                 if (skb_queue_len(&sd->input_pkt_queue)) {
2823 enqueue:
2824                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2825                         input_queue_tail_incr_save(sd, qtail);
2826                         rps_unlock(sd);
2827                         local_irq_restore(flags);
2828                         return NET_RX_SUCCESS;
2829                 }
2830
2831                 /* Schedule NAPI for backlog device
2832                  * We can use non atomic operation since we own the queue lock
                      *
                      * If the backlog was not already marked scheduled, either
                      * queue an IPI for the remote cpu (rps_ipi_queued()
                      * returns 1) or put it on the poll list directly.
2833                  */
2834                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2835                         if (!rps_ipi_queued(sd))
2836                                 ____napi_schedule(sd, &sd->backlog);
2837                 }
2838                 goto enqueue;
2839         }
2840
             /* Queue over the limit: account for the drop and free the packet. */
2841         sd->dropped++;
2842         rps_unlock(sd);
2843
2844         local_irq_restore(flags);
2845
2846         atomic_long_inc(&skb->dev->rx_dropped);
2847         kfree_skb(skb);
2848         return NET_RX_DROP;
2849 }
2850
2851 /**
2852  *      netif_rx        -       post buffer to the network code
2853  *      @skb: buffer to post
2854  *
2855  *      This function receives a packet from a device driver and queues it for
2856  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2857  *      may be dropped during processing for congestion control or by the
2858  *      protocol layers.
2859  *
      *      With CONFIG_RPS the packet goes to the backlog of the cpu chosen by
      *      get_rps_cpu(), or of the current cpu when that returns a negative
      *      value.  Without CONFIG_RPS it always goes to the current cpu.
      *
2860  *      return values:
2861  *      NET_RX_SUCCESS  (no congestion)
2862  *      NET_RX_DROP     (packet was dropped)
2863  *
2864  */
2865
2866 int netif_rx(struct sk_buff *skb)
2867 {
2868         int ret;
2869
2870         /* if netpoll wants it, pretend we never saw it */
2871         if (netpoll_rx(skb))
2872                 return NET_RX_DROP;
2873
2874         if (netdev_tstamp_prequeue)
2875                 net_timestamp_check(skb);
2876
2877         trace_netif_rx(skb);
2878 #ifdef CONFIG_RPS
2879         {
                     /*
                      * rflow starts out pointing at a throw-away entry, so
                      * &rflow->last_qtail is valid even when get_rps_cpu()
                      * does not set *rflowp.
                      */
2880                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2881                 int cpu;
2882
2883                 preempt_disable();
2884                 rcu_read_lock();
2885
2886                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2887                 if (cpu < 0)
2888                         cpu = smp_processor_id();
2889
2890                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2891
2892                 rcu_read_unlock();
2893                 preempt_enable();
2894         }
2895 #else
2896         {
                     /* qtail is required by the callee; its value is not used here. */
2897                 unsigned int qtail;
2898                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2899                 put_cpu();
2900         }
2901 #endif
2902         return ret;
2903 }
2904 EXPORT_SYMBOL(netif_rx);
2905
     /*
      * netif_rx_ni - netif_rx() followed by immediate softirq processing.
      * @skb: buffer to post
      *
      * With preemption disabled, calls netif_rx() and then runs do_softirq()
      * if any softirq is pending.  Returns the value returned by netif_rx().
      */
2906 int netif_rx_ni(struct sk_buff *skb)
2907 {
2908         int err;
2909
2910         preempt_disable();
2911         err = netif_rx(skb);
2912         if (local_softirq_pending())
2913                 do_softirq();
2914         preempt_enable();
2915
2916         return err;
2917 }
2918 EXPORT_SYMBOL(netif_rx_ni);
2919
     /*
      * net_tx_action - softirq action handler for the transmit side
      * (presumably registered for NET_TX_SOFTIRQ; the registration is not
      * visible in this part of the file).
      * @h: softirq action descriptor, not used here
      *
      * Works on this cpu's softnet_data in two steps:
      *  1. frees every skb parked on completion_queue;
      *  2. runs, or reschedules, every qdisc linked on output_queue.
      */
2920 static void net_tx_action(struct softirq_action *h)
2921 {
2922         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2923
2924         if (sd->completion_queue) {
2925                 struct sk_buff *clist;
2926
                     /* Detach the whole list with irqs off, free it with irqs on. */
2927                 local_irq_disable();
2928                 clist = sd->completion_queue;
2929                 sd->completion_queue = NULL;
2930                 local_irq_enable();
2931
2932                 while (clist) {
2933                         struct sk_buff *skb = clist;
2934                         clist = clist->next;
2935
                             /* Warn if the skb still has a non-zero users count. */
2936                         WARN_ON(atomic_read(&skb->users));
2937                         trace_kfree_skb(skb, net_tx_action);
2938                         __kfree_skb(skb);
2939                 }
2940         }
2941
2942         if (sd->output_queue) {
2943                 struct Qdisc *head;
2944
                     /* Detach the list and reset the tail pointer to the empty state. */
2945                 local_irq_disable();
2946                 head = sd->output_queue;
2947                 sd->output_queue = NULL;
2948                 sd->output_queue_tailp = &sd->output_queue;
2949                 local_irq_enable();
2950
2951                 while (head) {
2952                         struct Qdisc *q = head;
2953                         spinlock_t *root_lock;
2954
2955                         head = head->next_sched;
2956
2957                         root_lock = qdisc_lock(q);
                             /*
                              * Lock taken: clear SCHED and run the qdisc.
                              * Lock busy: reschedule the qdisc, unless it has
                              * been deactivated, in which case only clear SCHED.
                              */
2958                         if (spin_trylock(root_lock)) {
2959                                 smp_mb__before_clear_bit();
2960                                 clear_bit(__QDISC_STATE_SCHED,
2961                                           &q->state);
2962                                 qdisc_run(q);
2963                                 spin_unlock(root_lock);
2964                         } else {
2965                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2966                                               &q->state)) {
2967                                         __netif_reschedule(q);
2968                                 } else {
2969                                         smp_mb__before_clear_bit();
2970                                         clear_bit(__QDISC_STATE_SCHED,
2971                                                   &q->state);
2972                                 }
2973                         }
2974                 }
2975         }
2976 }
2977
2978 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2979     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2980 /*
      * This hook is defined here for ATM LANE.
      * It only exists when both bridging and ATM LANE are configured (built
      * in or modular).  Nothing in this part of the file assigns it.
      */
2981 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2982                              unsigned char *addr) __read_mostly;
2983 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2984 #endif
2985
2986 #ifdef CONFIG_NET_CLS_ACT
2987 /* TODO: Maybe we should just force sch_ingress to be compiled in
2988  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2989  * a compare and 2 stores extra right now if we don't have it on
2990  * but have CONFIG_NET_CLS_ACT
2991  * NOTE: This doesn't stop any functionality; if you don't have
2992  * the ingress scheduler, you just can't add policies on ingress.
2993  *
2994  */
     /*
      * ing_filter - run @skb through the qdisc attached to ingress queue @rxq.
      *
      * The redirect TTL kept in skb->tc_verd is checked against MAX_RED_LOOP
      * and then stored back incremented.  Returns TC_ACT_SHOT when the limit
      * is exceeded, the result of qdisc_enqueue_root() when the qdisc is
      * used, and TC_ACT_OK when the qdisc is noop_qdisc or is deactivated.
      */
2995 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2996 {
2997         struct net_device *dev = skb->dev;
2998         u32 ttl = G_TC_RTTL(skb->tc_verd);
2999         int result = TC_ACT_OK;
3000         struct Qdisc *q;
3001
             /* Compares the old ttl; the post-increment feeds the store below. */
3002         if (unlikely(MAX_RED_LOOP < ttl++)) {
3003                 if (net_ratelimit())
3004                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3005                                skb->skb_iif, dev->ifindex);
3006                 return TC_ACT_SHOT;
3007         }
3008
3009         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3010         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3011
3012         q = rxq->qdisc;
3013         if (q != &noop_qdisc) {
3014                 spin_lock(qdisc_lock(q));
3015                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3016                         result = qdisc_enqueue_root(skb, q);
3017                 spin_unlock(qdisc_lock(q));
3018         }
3019
3020         return result;
3021 }
3022
     /*
      * handle_ing - ingress filtering step of __netif_receive_skb().
      * @skb:      packet being received
      * @pt_prev:  handler still waiting for delivery; delivered and reset to
      *            NULL before the packet is filtered
      * @ret:      receives the result of that pending delivery
      * @orig_dev: device passed on to deliver_skb()
      *
      * Does nothing but clear skb->tc_verd when the device has no ingress
      * queue or its qdisc is noop_qdisc.  Returns @skb, or NULL after freeing
      * it when ing_filter() answers TC_ACT_SHOT or TC_ACT_STOLEN.
      */
3023 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3024                                          struct packet_type **pt_prev,
3025                                          int *ret, struct net_device *orig_dev)
3026 {
3027         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3028
3029         if (!rxq || rxq->qdisc == &noop_qdisc)
3030                 goto out;
3031
3032         if (*pt_prev) {
3033                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3034                 *pt_prev = NULL;
3035         }
3036
3037         switch (ing_filter(skb, rxq)) {
3038         case TC_ACT_SHOT:
3039         case TC_ACT_STOLEN:
3040                 kfree_skb(skb);
3041                 return NULL;
3042         }
3043
3044 out:
3045         skb->tc_verd = 0;
3046         return skb;
3047 }
3048 #endif
3049
3050 /**
3051  *      netdev_rx_handler_register - register receive handler
3052  *      @dev: device to register a handler for
3053  *      @rx_handler: receive handler to register
3054  *      @rx_handler_data: data pointer that is used by rx handler
3055  *
3056  *      Register a receive handler for a device. This handler will then be
3057  *      called from __netif_receive_skb. A negative errno code is returned
3058  *      on a failure.
3059  *
      *      Returns 0 on success, or -EBUSY if the device already has a
      *      receive handler.
      *
3060  *      The caller must hold the rtnl_mutex.
3061  */
3062 int netdev_rx_handler_register(struct net_device *dev,
3063                                rx_handler_func_t *rx_handler,
3064                                void *rx_handler_data)
3065 {
3066         ASSERT_RTNL();
3067
3068         if (dev->rx_handler)
3069                 return -EBUSY;
3070
             /* The data pointer is published before the handler itself. */
3071         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3072         rcu_assign_pointer(dev->rx_handler, rx_handler);
3073
3074         return 0;
3075 }
3076 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3077
3078 /**
3079  *      netdev_rx_handler_unregister - unregister receive handler
3080  *      @dev: device to unregister a handler from
3081  *
3082  *      Unregister a receive handler from a device.
      *      Both the handler and its data pointer are set to NULL, the
      *      handler first.
3083  *
3084  *      The caller must hold the rtnl_mutex.
3085  */
3086 void netdev_rx_handler_unregister(struct net_device *dev)
3087 {
3088
3089         ASSERT_RTNL();
3090         rcu_assign_pointer(dev->rx_handler, NULL);
3091         rcu_assign_pointer(dev->rx_handler_data, NULL);
3092 }
3093 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3094
     /*
      * For frames addressed to this host (PACKET_HOST), overwrite the
      * Ethernet destination address with @master's device address.
      * Frames of any other packet type are left untouched.
      */
3095 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
3096                                               struct net_device *master)
3097 {
3098         /* Nothing to rewrite unless the frame was addressed to us. */
3099         if (skb->pkt_type != PACKET_HOST)
3100                 return;
3101
3102         memcpy(eth_hdr(skb)->h_dest, master->dev_addr, ETH_ALEN);
3103 }
3104
3105 /* On bonding slaves other than the currently active slave, suppress
3106  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
3107  * ARP on active-backup slaves with arp_validate enabled.
      *
      * @skb:    frame received on a slave (skb->dev is the slave)
      * @master: the slave's master device
      *
      * Returns 1 when the frame should be dropped, 0 when it should be kept.
3108  */
3109 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
3110 {
3111         struct net_device *slave = skb->dev;
3112
3113         if (master->priv_flags & IFF_MASTER_ARPMON)
3114                 slave->last_rx = jiffies;
3115
3116         /*
3117          * Do address unmangle. The local destination address
3118          * will be always the one master has. Provides the right
3119          * functionality in a bridge.
3120          */
3121         if ((master->priv_flags & IFF_MASTER_ALB) &&
3122             (master->priv_flags & IFF_BRIDGE_PORT))
3123                 skb_bond_set_mac_by_master(skb, master);
3124
             /* Only slaves flagged IFF_SLAVE_INACTIVE ever drop frames here. */
3125         if (!(slave->priv_flags & IFF_SLAVE_INACTIVE))
3126                 return 0;
3127
             /* Inactive slave: keep the exceptions, drop everything else. */
3128         if ((slave->priv_flags & IFF_SLAVE_NEEDARP) &&
3129             skb->protocol == __cpu_to_be16(ETH_P_ARP))
3130                 return 0;
3131
3132         if ((master->priv_flags & IFF_MASTER_ALB) &&
3133             skb->pkt_type != PACKET_BROADCAST &&
3134             skb->pkt_type != PACKET_MULTICAST)
3135                 return 0;
3136
3137         if (master->priv_flags & IFF_MASTER_8023AD &&
3138             skb->protocol == __cpu_to_be16(ETH_P_SLOW))
3139                 return 0;
3140
3141         return 1;
3142 }
3143 EXPORT_SYMBOL(__skb_bond_should_drop);
3144
     /*
      * __netif_receive_skb - push one received skb through the receive stages:
      * the handlers on ptype_all, ingress filtering (CONFIG_NET_CLS_ACT),
      * the device's rx_handler, VLAN hardware-acceleration handling, and
      * finally the handlers registered in ptype_base for skb->protocol.
      *
      * Delivery runs "one behind": pt_prev remembers the latest matching
      * handler and is only invoked through deliver_skb() once another match
      * or a later stage comes along.  The last match is called directly via
      * pt_prev->func().  If no handler matched at all, the skb is freed and
      * counted in the device's rx_dropped.
      *
      * Returns NET_RX_DROP unless a handler invoked along the way supplied
      * another value.
      */
3145 static int __netif_receive_skb(struct sk_buff *skb)
3146 {
3147         struct packet_type *ptype, *pt_prev;
3148         rx_handler_func_t *rx_handler;
3149         struct net_device *orig_dev;
3150         struct net_device *master;
3151         struct net_device *null_or_orig;
3152         struct net_device *orig_or_bond;
3153         int ret = NET_RX_DROP;
3154         __be16 type;
3155
             /* Counterpart of the prequeue check in netif_rx()/netif_receive_skb(). */
3156         if (!netdev_tstamp_prequeue)
3157                 net_timestamp_check(skb);
3158
3159         trace_netif_receive_skb(skb);
3160
3161         /* if we've gotten here through NAPI, check netpoll */
3162         if (netpoll_receive_skb(skb))
3163                 return NET_RX_DROP;
3164
             /* Record the incoming interface index once, if not already set. */
3165         if (!skb->skb_iif)
3166                 skb->skb_iif = skb->dev->ifindex;
3167
3168         /*
3169          * bonding note: skbs received on inactive slaves should only
3170          * be delivered to pkt handlers that are exact matches.  Also
3171          * the deliver_no_wcard flag will be set.  If packet handlers
3172          * are sensitive to duplicate packets these skbs will need to
3173          * be dropped at the handler.
3174          */
             /*
              * null_or_orig stays NULL to also match handlers with no device
              * binding (ptype->dev == NULL); it is set to orig_dev to restrict
              * delivery to exact device matches.
              */
3175         null_or_orig = NULL;
3176         orig_dev = skb->dev;
3177         master = ACCESS_ONCE(orig_dev->master);
3178         if (skb->deliver_no_wcard)
3179                 null_or_orig = orig_dev;
3180         else if (master) {
3181                 if (skb_bond_should_drop(skb, master)) {
3182                         skb->deliver_no_wcard = 1;
3183                         null_or_orig = orig_dev; /* deliver only exact match */
3184                 } else
3185                         skb->dev = master;
3186         }
3187
3188         __this_cpu_inc(softnet_data.processed);
3189         skb_reset_network_header(skb);
3190         skb_reset_transport_header(skb);
3191         skb->mac_len = skb->network_header - skb->mac_header;
3192
3193         pt_prev = NULL;
3194
3195         rcu_read_lock();
3196
3197 #ifdef CONFIG_NET_CLS_ACT
             /* TC_NCLS set: clear it and skip both ptype_all and ingress filtering. */
3198         if (skb->tc_verd & TC_NCLS) {
3199                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3200                 goto ncls;
3201         }
3202 #endif
3203
             /* Stage 1: handlers registered on ptype_all. */
3204         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3205                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3206                     ptype->dev == orig_dev) {
3207                         if (pt_prev)
3208                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3209                         pt_prev = ptype;
3210                 }
3211         }
3212
3213 #ifdef CONFIG_NET_CLS_ACT
             /* Stage 2: ingress filtering; a NULL skb means it was consumed. */
3214         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3215         if (!skb)
3216                 goto out;
3217 ncls:
3218 #endif
3219
3220         /* Handle special case of bridge or macvlan */
3221         rx_handler = rcu_dereference(skb->dev->rx_handler);
3222         if (rx_handler) {
                     /* Flush the pending handler before the skb is handed over. */
3223                 if (pt_prev) {
3224                         ret = deliver_skb(skb, pt_prev, orig_dev);
3225                         pt_prev = NULL;
3226                 }
3227                 skb = rx_handler(skb);
3228                 if (!skb)
3229                         goto out;
3230         }
3231
3232         if (vlan_tx_tag_present(skb)) {
3233                 if (pt_prev) {
3234                         ret = deliver_skb(skb, pt_prev, orig_dev);
3235                         pt_prev = NULL;
3236                 }
                     /*
                      * Non-zero return: run the whole receive path again on
                      * the (possibly replaced) skb.  Zero return with a NULL
                      * skb: nothing left to deliver.
                      */
3237                 if (vlan_hwaccel_do_receive(&skb)) {
3238                         ret = __netif_receive_skb(skb);
3239                         goto out;
3240                 } else if (unlikely(!skb))
3241                         goto out;
3242         }
3243
3244         /*
3245          * Make sure frames received on VLAN interfaces stacked on
3246          * bonding interfaces still make their way to any base bonding
3247          * device that may have registered for a specific ptype.  The
3248          * handler may have to adjust skb->dev and orig_dev.
3249          */
3250         orig_or_bond = orig_dev;
3251         if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3252             (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
3253                 orig_or_bond = vlan_dev_real_dev(skb->dev);
3254         }
3255
             /* Final stage: handlers hashed in ptype_base for this protocol. */
3256         type = skb->protocol;
3257         list_for_each_entry_rcu(ptype,
3258                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3259                 if (ptype->type == type && (ptype->dev == null_or_orig ||
3260                      ptype->dev == skb->dev || ptype->dev == orig_dev ||
3261                      ptype->dev == orig_or_bond)) {
3262                         if (pt_prev)
3263                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3264                         pt_prev = ptype;
3265                 }
3266         }
3267
             /* The last matching handler is called directly, not via deliver_skb(). */
3268         if (pt_prev) {
3269                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3270         } else {
3271                 atomic_long_inc(&skb->dev->rx_dropped);
3272                 kfree_skb(skb);
3273                 /* Jamal, now you will not able to escape explaining
3274                  * me how you were going to use this. :-)
3275                  */
3276                 ret = NET_RX_DROP;
3277         }
3278
3279 out:
3280         rcu_read_unlock();
3281         return ret;
3282 }
3283
3284 /**
3285  *      netif_receive_skb - process receive buffer from network
3286  *      @skb: buffer to process
3287  *
3288  *      netif_receive_skb() is the main receive data processing function.
3289  *      It always succeeds. The buffer may be dropped during processing
3290  *      for congestion control or by the protocol layers.
3291  *
      *      With CONFIG_RPS, a packet for which get_rps_cpu() selects a cpu
      *      is queued to that cpu's backlog instead of being processed here;
      *      otherwise it goes straight to __netif_receive_skb().
      *
3292  *      This function may only be called from softirq context and interrupts
3293  *      should be enabled.
3294  *
3295  *      Return values (usually ignored):
3296  *      NET_RX_SUCCESS: no congestion
3297  *      NET_RX_DROP: packet was dropped
3298  */
3299 int netif_receive_skb(struct sk_buff *skb)
3300 {
3301         if (netdev_tstamp_prequeue)
3302                 net_timestamp_check(skb);
3303
             /* Reported as success without further processing when this is non-zero. */
3304         if (skb_defer_rx_timestamp(skb))
3305                 return NET_RX_SUCCESS;
3306
3307 #ifdef CONFIG_RPS
3308         {
                     /* Throw-away entry so &rflow->last_qtail is always valid. */
3309                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3310                 int cpu, ret;
3311
3312                 rcu_read_lock();
3313
3314                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3315
3316                 if (cpu >= 0) {
3317                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3318                         rcu_read_unlock();
3319                 } else {
                             /* No steering: drop the RCU lock first, then process inline. */
3320                         rcu_read_unlock();
3321                         ret = __netif_receive_skb(skb);
3322                 }
3323
3324                 return ret;
3325         }
3326 #else
3327         return __netif_receive_skb(skb);
3328 #endif
3329 }
3330 EXPORT_SYMBOL(netif_receive_skb);
3331
3332 /* Network device is going away, flush any packets still pending
3333  * Called with irqs disabled.
      *
      * @arg is the struct net_device being flushed.  Every skb belonging to
      * it is unlinked from this cpu's input_pkt_queue (under rps_lock) and
      * from its process_queue (without that lock), then freed;
      * input_queue_head_incr() is called once per removed packet.
3334  */
3335 static void flush_backlog(void *arg)
3336 {
3337         struct net_device *dev = arg;
3338         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3339         struct sk_buff *skb, *tmp;
3340
3341         rps_lock(sd);
3342         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3343                 if (skb->dev == dev) {
3344                         __skb_unlink(skb, &sd->input_pkt_queue);
3345                         kfree_skb(skb);
3346                         input_queue_head_incr(sd);
3347                 }
3348         }
3349         rps_unlock(sd);
3350
3351         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3352                 if (skb->dev == dev) {
3353                         __skb_unlink(skb, &sd->process_queue);
3354                         kfree_skb(skb);
3355                         input_queue_head_incr(sd);
3356                 }
3357         }
3358 }
3359
     /*
      * napi_gro_complete - finish GRO processing of @skb and pass it up.
      *
      * When NAPI_GRO_CB(skb)->count is 1, only gso_size is cleared.
      * Otherwise the gro_complete() callback of the first packet type that
      * matches skb->protocol, has no device binding and provides the
      * callback is invoked.  If that fails, or no such type exists, the skb
      * is freed and NET_RX_SUCCESS is returned.  In all other cases the
      * result of netif_receive_skb() is returned.
      */
3360 static int napi_gro_complete(struct sk_buff *skb)
3361 {
3362         struct packet_type *ptype;
3363         __be16 type = skb->protocol;
3364         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3365         int err = -ENOENT;
3366
3367         if (NAPI_GRO_CB(skb)->count == 1) {
3368                 skb_shinfo(skb)->gso_size = 0;
3369                 goto out;
3370         }
3371
3372         rcu_read_lock();
3373         list_for_each_entry_rcu(ptype, head, list) {
3374                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3375                         continue;
3376
3377                 err = ptype->gro_complete(skb);
3378                 break;
3379         }
3380         rcu_read_unlock();
3381
3382         if (err) {
                     /* Warn when the loop ran off the end, i.e. no type matched. */
3383                 WARN_ON(&ptype->list == head);
3384                 kfree_skb(skb);
3385                 return NET_RX_SUCCESS;
3386         }
3387
3388 out:
3389         return netif_receive_skb(skb);
3390 }
3391
3392 inline void napi_gro_flush(struct napi_struct *napi)
3393 {
3394         struct sk_buff *skb, *next;
3395
3396         for (skb = napi->gro_list; skb; skb = next) {
3397                 next = skb->next;
3398                 skb->next = NULL;
3399                 napi_gro_complete(skb);
3400         }
3401
3402         napi->gro_count = 0;
3403         napi->gro_list = NULL;
3404 }
3405 EXPORT_SYMBOL(napi_gro_flush);
3406
/*
 * Offer @skb to the GRO receive handler registered for its protocol.
 *
 * Returns GRO_NORMAL when GRO does not take the packet, GRO_HELD when @skb
 * itself was put on napi->gro_list, and GRO_MERGED or GRO_MERGED_FREE
 * (chosen by NAPI_GRO_CB(skb)->free) when the handler marked it as
 * belonging to an existing flow.
 */
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        struct sk_buff **pp = NULL;
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
        int same_flow;
        int mac_len;
        enum gro_result ret;

        /* GRO not enabled on the device, or netpoll rx active: bypass GRO. */
        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
                goto normal;

        /* Packets that are already GSO or carry a frag list are not offered. */
        if (skb_is_gso(skb) || skb_has_frag_list(skb))
                goto normal;

        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
                /*
                 * Only handlers for this protocol, not bound to a device
                 * and providing ->gro_receive() qualify; the first one wins.
                 */
                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
                        continue;

                skb_set_network_header(skb, skb_gro_offset(skb));
                mac_len = skb->network_header - skb->mac_header;
                skb->mac_len = mac_len;
                NAPI_GRO_CB(skb)->same_flow = 0;
                NAPI_GRO_CB(skb)->flush = 0;
                NAPI_GRO_CB(skb)->free = 0;

                pp = ptype->gro_receive(&napi->gro_list, skb);
                break;
        }
        rcu_read_unlock();

        /* The loop ran off the end of the bucket: no handler matched. */
        if (&ptype->list == head)
                goto normal;

        same_flow = NAPI_GRO_CB(skb)->same_flow;
        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

        /*
         * The handler returned a slot in gro_list: unlink the skb found
         * there and complete it right away.
         */
        if (pp) {
                struct sk_buff *nskb = *pp;

                *pp = nskb->next;
                nskb->next = NULL;
                napi_gro_complete(nskb);
                napi->gro_count--;
        }

        if (same_flow)
                goto ok;

        /* Flush requested by the handler, or gro_list is already full. */
        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
                goto normal;

        /* Hold @skb at the head of gro_list as the start of a new flow. */
        napi->gro_count++;
        NAPI_GRO_CB(skb)->count = 1;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
        skb->next = napi->gro_list;
        napi->gro_list = skb;
        ret = GRO_HELD;

pull:
        /*
         * If the GRO offset lies beyond the linear area, copy the missing
         * bytes from frag0 to the tail and trim them off frags[0].
         */
        if (skb_headlen(skb) < skb_gro_offset(skb)) {
                int grow = skb_gro_offset(skb) - skb_headlen(skb);

                BUG_ON(skb->end - skb->tail < grow);

                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

                skb->tail += grow;
                skb->data_len -= grow;

                skb_shinfo(skb)->frags[0].page_offset += grow;
                skb_shinfo(skb)->frags[0].size -= grow;

                /* frags[0] is now empty: drop its page and shift the rest down. */
                if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
                        put_page(skb_shinfo(skb)->frags[0].page);
                        memmove(skb_shinfo(skb)->frags,
                                skb_shinfo(skb)->frags + 1,
                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
                }
        }

ok:
        return ret;

normal:
        ret = GRO_NORMAL;
        goto pull;
}
3497 EXPORT_SYMBOL(dev_gro_receive);
3498
3499 static inline gro_result_t
3500 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3501 {
3502         struct sk_buff *p;
3503
3504         for (p = napi->gro_list; p; p = p->next) {
3505                 unsigned long diffs;
3506
3507                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3508                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3509                 diffs |= compare_ether_header(skb_mac_header(p),
3510                                               skb_gro_mac_header(skb));
3511                 NAPI_GRO_CB(p)->same_flow = !diffs;
3512                 NAPI_GRO_CB(p)->flush = 0;
3513         }
3514
3515         return dev_gro_receive(napi, skb);
3516 }
3517
3518 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3519 {
3520         switch (ret) {
3521         case GRO_NORMAL:
3522                 if (netif_receive_skb(skb))
3523                         ret = GRO_DROP;
3524                 break;
3525
3526         case GRO_DROP:
3527         case GRO_MERGED_FREE:
3528                 kfree_skb(skb);
3529                 break;
3530
3531         case GRO_HELD:
3532         case GRO_MERGED:
3533                 break;
3534         }
3535
3536         return ret;
3537 }
3538 EXPORT_SYMBOL(napi_skb_finish);
3539
3540 void skb_gro_reset_offset(struct sk_buff *skb)
3541 {
3542         NAPI_GRO_CB(skb)->data_offset = 0;
3543         NAPI_GRO_CB(skb)->frag0 = NULL;
3544         NAPI_GRO_CB(skb)->frag0_len = 0;
3545
3546         if (skb->mac_header == skb->tail &&
3547             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3548                 NAPI_GRO_CB(skb)->frag0 =
3549                         page_address(skb_shinfo(skb)->frags[0].page) +
3550                         skb_shinfo(skb)->frags[0].page_offset;
3551                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3552         }
3553 }
3554 EXPORT_SYMBOL(skb_gro_reset_offset);
3555
3556 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3557 {
3558         skb_gro_reset_offset(skb);
3559
3560         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3561 }
3562 EXPORT_SYMBOL(napi_gro_receive);
3563
3564 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3565 {
3566         __skb_pull(skb, skb_headlen(skb));
3567         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3568         skb->vlan_tci = 0;
3569         skb->dev = napi->dev;
3570         skb->skb_iif = 0;
3571
3572         napi->skb = skb;
3573 }
3574
3575 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3576 {
3577         struct sk_buff *skb = napi->skb;
3578
3579         if (!skb) {
3580                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3581                 if (skb)
3582                         napi->skb = skb;
3583         }
3584         return skb;
3585 }
3586 EXPORT_SYMBOL(napi_get_frags);
3587
3588 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3589                                gro_result_t ret)
3590 {
3591         switch (ret) {
3592         case GRO_NORMAL:
3593         case GRO_HELD:
3594                 skb->protocol = eth_type_trans(skb, skb->dev);
3595
3596                 if (ret == GRO_HELD)
3597                         skb_gro_pull(skb, -ETH_HLEN);
3598                 else if (netif_receive_skb(skb))
3599                         ret = GRO_DROP;
3600                 break;
3601
3602         case GRO_DROP:
3603         case GRO_MERGED_FREE:
3604                 napi_reuse_skb(napi, skb);
3605                 break;
3606
3607         case GRO_MERGED:
3608                 break;
3609         }
3610
3611         return ret;
3612 }
3613 EXPORT_SYMBOL(napi_frags_finish);
3614
3615 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3616 {
3617         struct sk_buff *skb = napi->skb;
3618         struct ethhdr *eth;
3619         unsigned int hlen;
3620         unsigned int off;
3621
3622         napi->skb = NULL;
3623
3624         skb_reset_mac_header(skb);
3625         skb_gro_reset_offset(skb);
3626
3627         off = skb_gro_offset(skb);
3628         hlen = off + sizeof(*eth);
3629         eth = skb_gro_header_fast(skb, off);
3630         if (skb_gro_header_hard(skb, hlen)) {
3631                 eth = skb_gro_header_slow(skb, hlen, off);
3632                 if (unlikely(!eth)) {
3633                         napi_reuse_skb(napi, skb);
3634                         skb = NULL;
3635                         goto out;
3636                 }
3637         }
3638
3639         skb_gro_pull(skb, sizeof(*eth));
3640
3641         /*
3642          * This works because the only protocols we care about don't require
3643          * special handling.  We'll fix it up properly at the end.
3644          */
3645         skb->protocol = eth->h_proto;
3646
3647 out:
3648         return skb;
3649 }
3650 EXPORT_SYMBOL(napi_frags_skb);
3651
3652 gro_result_t napi_gro_frags(struct napi_struct *napi)
3653 {
3654         struct sk_buff *skb = napi_frags_skb(napi);
3655
3656         if (!skb)
3657                 return GRO_DROP;
3658
3659         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3660 }
3661 EXPORT_SYMBOL(napi_gro_frags);
3662
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for rps.
 * Note: called with local irqs disabled, returns with local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *target = sd->rps_ipi_list;
        struct softnet_data *following;

        if (!target) {
                local_irq_enable();
                return;
        }

        /* Detach the whole list before irqs are switched back on. */
        sd->rps_ipi_list = NULL;

        local_irq_enable();

        /* Send pending IPIs to kick RPS processing on remote cpus. */
        for (; target; target = following) {
                following = target->rps_ipi_next;

                if (cpu_online(target->cpu))
                        __smp_call_function_single(target->cpu,
                                                   &target->csd, 0);
        }
#else
        local_irq_enable();
#endif
}
3690
/*
 * Poll handler for the per-cpu backlog napi (sd->backlog).
 * Feeds up to @quota queued packets into __netif_receive_skb() and returns
 * the number of packets handled.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
        int work = 0;
        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
        /* Check if we have pending IPIs; it's better to send them now
         * than to wait for the end of net_rx_action().
         */
        if (sd->rps_ipi_list) {
                local_irq_disable();
                net_rps_action_and_irq_enable(sd);
        }
#endif
        napi->weight = weight_p;
        local_irq_disable();
        while (work < quota) {
                struct sk_buff *skb;
                unsigned int qlen;

                /*
                 * Drain process_queue.  irqs are enabled only around the
                 * delivery of each packet and are off again for the dequeue.
                 */
                while ((skb = __skb_dequeue(&sd->process_queue))) {
                        local_irq_enable();
                        __netif_receive_skb(skb);
                        local_irq_disable();
                        input_queue_head_incr(sd);
                        if (++work >= quota) {
                                local_irq_enable();
                                return work;
                        }
                }

                /* Refill process_queue from input_pkt_queue under rps_lock(). */
                rps_lock(sd);
                qlen = skb_queue_len(&sd->input_pkt_queue);
                if (qlen)
                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                                   &sd->process_queue);

                if (qlen < quota - work) {
                        /*
                         * Inline a custom version of __napi_complete().
                         * Only the current cpu owns and manipulates this napi,
                         * and NAPI_STATE_SCHED is the only possible flag set on backlog.
                         * We can use a plain write instead of clear_bit(),
                         * and we don't need an smp_mb() memory barrier.
                         */
                        list_del(&napi->poll_list);
                        napi->state = 0;

                        /* Stop the outer loop once the spliced packets are handled. */
                        quota = work + qlen;
                }
                rps_unlock(sd);
        }
        local_irq_enable();

        return work;
}
3747
3748 /**
3749  * __napi_schedule - schedule for receive
3750  * @n: entry to schedule
3751  *
3752  * The entry's receive function will be scheduled to run
3753  */
3754 void __napi_schedule(struct napi_struct *n)
3755 {
3756         unsigned long flags;
3757
3758         local_irq_save(flags);
3759         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3760         local_irq_restore(flags);
3761 }
3762 EXPORT_SYMBOL(__napi_schedule);
3763
/*
 * Remove @n from its poll list and clear NAPI_STATE_SCHED.
 * BUGs if @n is not currently scheduled or still holds skbs on gro_list.
 * napi_complete() calls this with local irqs disabled.
 */
void __napi_complete(struct napi_struct *n)
{
        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
        BUG_ON(n->gro_list);

        list_del(&n->poll_list);
        /* Barrier so the list removal is ordered before the bit is cleared. */
        smp_mb__before_clear_bit();
        clear_bit(NAPI_STATE_SCHED, &n->state);
}
3773 EXPORT_SYMBOL(__napi_complete);
3774
3775 void napi_complete(struct napi_struct *n)
3776 {
3777         unsigned long flags;
3778
3779         /*
3780          * don't let napi dequeue from the cpu poll list
3781          * just in case its running on a different cpu
3782          */
3783         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3784                 return;
3785
3786         napi_gro_flush(n);
3787         local_irq_save(flags);
3788         __napi_complete(n);
3789         local_irq_restore(flags);
3790 }
3791 EXPORT_SYMBOL(napi_complete);
3792
3793 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3794                     int (*poll)(struct napi_struct *, int), int weight)
3795 {
3796         INIT_LIST_HEAD(&napi->poll_list);
3797         napi->gro_count = 0;
3798         napi->gro_list = NULL;
3799         napi->skb = NULL;
3800         napi->poll = poll;
3801         napi->weight = weight;
3802         list_add(&napi->dev_list, &dev->napi_list);
3803         napi->dev = dev;
3804 #ifdef CONFIG_NETPOLL
3805         spin_lock_init(&napi->poll_lock);
3806         napi->poll_owner = -1;
3807 #endif
3808         set_bit(NAPI_STATE_SCHED, &napi->state);
3809 }
3810 EXPORT_SYMBOL(netif_napi_add);
3811
3812 void netif_napi_del(struct napi_struct *napi)
3813 {
3814         struct sk_buff *skb, *next;
3815
3816         list_del_init(&napi->dev_list);
3817         napi_free_frags(napi);
3818
3819         for (skb = napi->gro_list; skb; skb = next) {
3820                 next = skb->next;
3821                 skb->next = NULL;
3822                 kfree_skb(skb);
3823         }
3824
3825         napi->gro_list = NULL;
3826         napi->gro_count = 0;
3827 }
3828 EXPORT_SYMBOL(netif_napi_del);
3829
/*
 * Softirq action that polls the napi instances queued on this cpu's
 * sd->poll_list.  It stops when the list is empty, when the budget
 * (netdev_budget) is used up, or when more than 2 jiffies have elapsed; in
 * the latter two cases time_squeeze is bumped and NET_RX_SOFTIRQ is raised
 * again.
 */
static void net_rx_action(struct softirq_action *h)
{
        struct softnet_data *sd = &__get_cpu_var(softnet_data);
        unsigned long time_limit = jiffies + 2;
        int budget = netdev_budget;
        void *have;

        local_irq_disable();

        while (!list_empty(&sd->poll_list)) {
                struct napi_struct *n;
                int work, weight;

                /* If the softirq window is exhausted then punt.
                 * Allow this to run for 2 jiffies, which will allow
                 * an average latency of 1.5/HZ.
                 */
                if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
                        goto softnet_break;

                local_irq_enable();

                /* Even though interrupts have been re-enabled, this
                 * access is safe because interrupts can only add new
                 * entries to the tail of this list, and only ->poll()
                 * calls can remove this head entry from the list.
                 */
                n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

                have = netpoll_poll_lock(n);

                weight = n->weight;

                /* This NAPI_STATE_SCHED test is for avoiding a race
                 * with netpoll's poll_napi().  Only the entity which
                 * obtains the lock and sees NAPI_STATE_SCHED set will
                 * actually make the ->poll() call.  Therefore we avoid
                 * accidentally calling ->poll() when NAPI is not scheduled.
                 */
                work = 0;
                if (test_bit(NAPI_STATE_SCHED, &n->state)) {
                        work = n->poll(n, weight);
                        trace_napi_poll(n);
                }

                /* A poll callback must not report more work than its weight. */
                WARN_ON_ONCE(work > weight);

                budget -= work;

                local_irq_disable();

                /* Drivers must not modify the NAPI state if they
                 * consume the entire weight.  In such cases this code
                 * still "owns" the NAPI instance and therefore can
                 * move the instance around on the list at-will.
                 */
                if (unlikely(work == weight)) {
                        if (unlikely(napi_disable_pending(n))) {
                                /* napi_complete() is called with irqs enabled. */
                                local_irq_enable();
                                napi_complete(n);
                                local_irq_disable();
                        } else
                                list_move_tail(&n->poll_list, &sd->poll_list);
                }

                netpoll_poll_unlock(have);
        }
out:
        /* Sends pending RPS IPIs (if any) and re-enables local irqs. */
        net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
        /*
         * There may not be any more sk_buffs coming right now, so push
         * any pending DMA copies to hardware
         */
        dma_issue_pending_all();
#endif

        return;

softnet_break:
        sd->time_squeeze++;
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        goto out;
}
3915
/*
 * SIOCGIFCONF handlers indexed by address family; filled in by
 * register_gifconf() and invoked from dev_ifconf().
 */
static gifconf_func_t *gifconf_list[NPROTO];
3917
3918 /**
3919  *      register_gifconf        -       register a SIOCGIF handler
3920  *      @family: Address family
3921  *      @gifconf: Function handler
3922  *
3923  *      Register protocol dependent address dumping routines. The handler
3924  *      that is passed must not be freed or reused until it has been replaced
3925  *      by another handler.
3926  */
3927 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3928 {
3929         if (family >= NPROTO)
3930                 return -EINVAL;
3931         gifconf_list[family] = gifconf;
3932         return 0;
3933 }
3934 EXPORT_SYMBOL(register_gifconf);
3935
3936
3937 /*
3938  *      Map an interface index to its name (SIOCGIFNAME)
3939  */
3940
3941 /*
3942  *      We need this ioctl for efficient implementation of the
3943  *      if_indextoname() function required by the IPv6 API.  Without
3944  *      it, we would have to search all the interfaces to find a
3945  *      match.  --pb
3946  */
3947
3948 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3949 {
3950         struct net_device *dev;
3951         struct ifreq ifr;
3952
3953         /*
3954          *      Fetch the caller's info block.
3955          */
3956
3957         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3958                 return -EFAULT;
3959
3960         rcu_read_lock();
3961         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3962         if (!dev) {
3963                 rcu_read_unlock();
3964                 return -ENODEV;
3965         }
3966
3967         strcpy(ifr.ifr_name, dev->name);
3968         rcu_read_unlock();
3969
3970         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3971                 return -EFAULT;
3972         return 0;
3973 }
3974
3975 /*
3976  *      Perform a SIOCGIFCONF call. This structure will change
3977  *      size eventually, and there is nothing I can do about it.
3978  *      Thus we will need a 'compatibility mode'.
3979  */
3980
3981 static int dev_ifconf(struct net *net, char __user *arg)
3982 {
3983         struct ifconf ifc;
3984         struct net_device *dev;
3985         char __user *pos;
3986         int len;
3987         int total;
3988         int i;
3989
3990         /*
3991          *      Fetch the caller's info block.
3992          */
3993
3994         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3995                 return -EFAULT;
3996
3997         pos = ifc.ifc_buf;
3998         len = ifc.ifc_len;
3999
4000         /*
4001          *      Loop over the interfaces, and write an info block for each.
4002          */
4003
4004         total = 0;
4005         for_each_netdev(net, dev) {
4006                 for (i = 0; i < NPROTO; i++) {
4007                         if (gifconf_list[i]) {
4008                                 int done;
4009                                 if (!pos)
4010                                         done = gifconf_list[i](dev, NULL, 0);
4011                                 else
4012                                         done = gifconf_list[i](dev, pos + total,
4013                                                                len - total);
4014                                 if (done < 0)
4015                                         return -EFAULT;
4016                                 total += done;
4017                         }
4018                 }
4019         }
4020
4021         /*
4022          *      All done.  Write the updated control block back to the caller.
4023          */
4024         ifc.ifc_len = total;
4025
4026         /*
4027          *      Both BSD and Solaris return 0 here, so we do too.
4028          */
4029         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4030 }
4031
4032 #ifdef CONFIG_PROC_FS
4033 /*
4034  *      This is invoked by the /proc filesystem handler to display a device
4035  *      in detail.
4036  */
4037 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4038         __acquires(RCU)
4039 {
4040         struct net *net = seq_file_net(seq);
4041         loff_t off;
4042         struct net_device *dev;
4043
4044         rcu_read_lock();
4045         if (!*pos)
4046                 return SEQ_START_TOKEN;
4047
4048         off = 1;
4049         for_each_netdev_rcu(net, dev)
4050                 if (off++ == *pos)
4051                         return dev;
4052
4053         return NULL;
4054 }
4055
4056 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4057 {
4058         struct net_device *dev = v;
4059
4060         if (v == SEQ_START_TOKEN)
4061                 dev = first_net_device_rcu(seq_file_net(seq));
4062         else
4063                 dev = next_net_device_rcu(dev);
4064
4065         ++*pos;
4066         return dev;
4067 }
4068
/* End of an iteration pass: drop the RCU read lock taken in dev_seq_start(). */
void dev_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}
4074
4075 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4076 {
4077         struct rtnl_link_stats64 temp;
4078         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4079
4080         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4081                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4082                    dev->name, stats->rx_bytes, stats->rx_packets,
4083                    stats->rx_errors,
4084                    stats->rx_dropped + stats->rx_missed_errors,
4085                    stats->rx_fifo_errors,
4086                    stats->rx_length_errors + stats->rx_over_errors +
4087                     stats->rx_crc_errors + stats->rx_frame_errors,
4088                    stats->rx_compressed, stats->multicast,
4089                    stats->tx_bytes, stats->tx_packets,
4090                    stats->tx_errors, stats->tx_dropped,
4091                    stats->tx_fifo_errors, stats->collisions,
4092                    stats->tx_carrier_errors +
4093                     stats->tx_aborted_errors +
4094                     stats->tx_window_errors +
4095                     stats->tx_heartbeat_errors,
4096                    stats->tx_compressed);
4097 }
4098
4099 /*
4100  *      Called from the PROCfs module. This now uses the new arbitrary sized
4101  *      /proc/net interface to create /proc/net/dev
4102  */
4103 static int dev_seq_show(struct seq_file *seq, void *v)
4104 {
4105         if (v == SEQ_START_TOKEN)
4106                 seq_puts(seq, "Inter-|   Receive                            "
4107                               "                    |  Transmit\n"
4108                               " face |bytes    packets errs drop fifo frame "
4109                               "compressed multicast|bytes    packets errs "
4110                               "drop fifo colls carrier compressed\n");
4111         else
4112                 dev_seq_printf_stats(seq, v);
4113         return 0;
4114 }
4115
4116 static struct softnet_data *softnet_get_online(loff_t *pos)
4117 {
4118         struct softnet_data *sd = NULL;
4119
4120         while (*pos < nr_cpu_ids)
4121                 if (cpu_online(*pos)) {
4122                         sd = &per_cpu(softnet_data, *pos);
4123                         break;
4124                 } else
4125                         ++*pos;
4126         return sd;
4127 }
4128
/* Begin iteration at the first online cpu at or after *pos. */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
        return softnet_get_online(pos);
}
4133
/* Step to the next online cpu after the current position. */
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return softnet_get_online(pos);
}
4139
/* Nothing to release: softnet_seq_start() takes no lock. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4143
/*
 * Print one line of per-cpu softnet counters as ten hex columns.
 * Columns 4 to 8 are always printed as zero.
 */
static int softnet_seq_show(struct seq_file *seq, void *v)
{
        struct softnet_data *sd = v;

        seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
                   sd->processed, sd->dropped, sd->time_squeeze, 0,
                   0, 0, 0, 0, /* was fastroute */
                   sd->cpu_collision, sd->received_rps);
        return 0;
}
4154
/* seq_file iterator for the per-device statistics; used by dev_seq_open(). */
static const struct seq_operations dev_seq_ops = {
        .start = dev_seq_start,
        .next  = dev_seq_next,
        .stop  = dev_seq_stop,
        .show  = dev_seq_show,
};
4161
/* Open handler: set up a per-namespace seq_file that iterates dev_seq_ops. */
static int dev_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &dev_seq_ops,
                            sizeof(struct seq_net_private));
}
4167
/* File operations registered for the "dev" proc entry in dev_proc_net_init(). */
static const struct file_operations dev_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = dev_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
4175
/* seq_file iterator over the softnet_data of online cpus; used by softnet_seq_open(). */
static const struct seq_operations softnet_seq_ops = {
        .start = softnet_seq_start,
        .next  = softnet_seq_next,
        .stop  = softnet_seq_stop,
        .show  = softnet_seq_show,
};
4182
/* Open handler: plain seq_open() with softnet_seq_ops (no per-namespace state). */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &softnet_seq_ops);
}
4187
/* File operations registered for the "softnet_stat" proc entry in dev_proc_net_init(). */
static const struct file_operations softnet_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = softnet_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
4195
4196 static void *ptype_get_idx(loff_t pos)
4197 {
4198         struct packet_type *pt = NULL;
4199         loff_t i = 0;
4200         int t;
4201
4202         list_for_each_entry_rcu(pt, &ptype_all, list) {
4203                 if (i == pos)
4204                         return pt;
4205                 ++i;
4206         }
4207
4208         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4209                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4210                         if (i == pos)
4211                                 return pt;
4212                         ++i;
4213                 }
4214         }
4215         return NULL;
4216 }
4217
4218 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4219         __acquires(RCU)
4220 {
4221         rcu_read_lock();
4222         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4223 }
4224
4225 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4226 {
4227         struct packet_type *pt;
4228         struct list_head *nxt;
4229         int hash;
4230
4231         ++*pos;
4232         if (v == SEQ_START_TOKEN)
4233                 return ptype_get_idx(0);
4234
4235         pt = v;
4236         nxt = pt->list.next;
4237         if (pt->type == htons(ETH_P_ALL)) {
4238                 if (nxt != &ptype_all)
4239                         goto found;
4240                 hash = 0;
4241                 nxt = ptype_base[0].next;
4242         } else
4243                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4244
4245         while (nxt == &ptype_base[hash]) {
4246                 if (++hash >= PTYPE_HASH_SIZE)
4247                         return NULL;
4248                 nxt = ptype_base[hash].next;
4249         }
4250 found:
4251         return list_entry(nxt, struct packet_type, list);
4252 }
4253
/* End of an iteration pass: drop the RCU read lock taken in ptype_seq_start(). */
static void ptype_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}
4259
4260 static int ptype_seq_show(struct seq_file *seq, void *v)
4261 {
4262         struct packet_type *pt = v;
4263
4264         if (v == SEQ_START_TOKEN)
4265                 seq_puts(seq, "Type Device      Function\n");
4266         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4267                 if (pt->type == htons(ETH_P_ALL))
4268                         seq_puts(seq, "ALL ");
4269                 else
4270                         seq_printf(seq, "%04x", ntohs(pt->type));
4271
4272                 seq_printf(seq, " %-8s %pF\n",
4273                            pt->dev ? pt->dev->name : "", pt->func);
4274         }
4275
4276         return 0;
4277 }
4278
/* seq_file iterator over the registered packet types; used by ptype_seq_open(). */
static const struct seq_operations ptype_seq_ops = {
        .start = ptype_seq_start,
        .next  = ptype_seq_next,
        .stop  = ptype_seq_stop,
        .show  = ptype_seq_show,
};
4285
/* Open handler: set up a per-namespace seq_file that iterates ptype_seq_ops. */
static int ptype_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &ptype_seq_ops,
                        sizeof(struct seq_net_private));
}
4291
/* File operations registered for the "ptype" proc entry in dev_proc_net_init(). */
static const struct file_operations ptype_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = ptype_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
4299
4300
4301 static int __net_init dev_proc_net_init(struct net *net)
4302 {
4303         int rc = -ENOMEM;
4304
4305         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4306                 goto out;
4307         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4308                 goto out_dev;
4309         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4310                 goto out_softnet;
4311
4312         if (wext_proc_init(net))
4313                 goto out_ptype;
4314         rc = 0;
4315 out:
4316         return rc;
4317 out_ptype:
4318         proc_net_remove(net, "ptype");
4319 out_softnet:
4320         proc_net_remove(net, "softnet_stat");
4321 out_dev:
4322         proc_net_remove(net, "dev");
4323         goto out;
4324 }
4325
4326 static void __net_exit dev_proc_net_exit(struct net *net)
4327 {
4328         wext_proc_exit(net);
4329
4330         proc_net_remove(net, "ptype");
4331         proc_net_remove(net, "softnet_stat");
4332         proc_net_remove(net, "dev");
4333 }
4334
4335 static struct pernet_operations __net_initdata dev_proc_ops = {
4336         .init = dev_proc_net_init,
4337         .exit = dev_proc_net_exit,
4338 };
4339
4340 static int __init dev_proc_init(void)
4341 {
4342         return register_pernet_subsys(&dev_proc_ops);
4343 }
4344 #else
4345 #define dev_proc_init() 0
4346 #endif  /* CONFIG_PROC_FS */
4347
4348
4349 /**
4350  *      netdev_set_master       -       set up master/slave pair
4351  *      @slave: slave device
4352  *      @master: new master device
4353  *
4354  *      Changes the master device of the slave. Pass %NULL to break the
4355  *      bonding. The caller must hold the RTNL semaphore. On a failure
4356  *      a negative errno code is returned. On success the reference counts
4357  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4358  *      function returns zero.
4359  */
4360 int netdev_set_master(struct net_device *slave, struct net_device *master)
4361 {
4362         struct net_device *old = slave->master;
4363
4364         ASSERT_RTNL();
4365
4366         if (master) {
4367                 if (old)
4368                         return -EBUSY;
4369                 dev_hold(master);
4370         }
4371
4372         slave->master = master;
4373
4374         if (old) {
4375                 synchronize_net();
4376                 dev_put(old);
4377         }
4378         if (master)
4379                 slave->flags |= IFF_SLAVE;
4380         else
4381                 slave->flags &= ~IFF_SLAVE;
4382
4383         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4384         return 0;
4385 }
4386 EXPORT_SYMBOL(netdev_set_master);
4387
4388 static void dev_change_rx_flags(struct net_device *dev, int flags)
4389 {
4390         const struct net_device_ops *ops = dev->netdev_ops;
4391
4392         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4393                 ops->ndo_change_rx_flags(dev, flags);
4394 }
4395
4396 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4397 {
4398         unsigned short old_flags = dev->flags;
4399         uid_t uid;
4400         gid_t gid;
4401
4402         ASSERT_RTNL();
4403
4404         dev->flags |= IFF_PROMISC;
4405         dev->promiscuity += inc;
4406         if (dev->promiscuity == 0) {
4407                 /*
4408                  * Avoid overflow.
4409                  * If inc causes overflow, untouch promisc and return error.
4410                  */
4411                 if (inc < 0)
4412                         dev->flags &= ~IFF_PROMISC;
4413                 else {
4414                         dev->promiscuity -= inc;
4415                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4416                                 "set promiscuity failed, promiscuity feature "
4417                                 "of device might be broken.\n", dev->name);
4418                         return -EOVERFLOW;
4419                 }
4420         }
4421         if (dev->flags != old_flags) {
4422                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4423                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4424                                                                "left");
4425                 if (audit_enabled) {
4426                         current_uid_gid(&uid, &gid);
4427                         audit_log(current->audit_context, GFP_ATOMIC,
4428                                 AUDIT_ANOM_PROMISCUOUS,
4429                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4430                                 dev->name, (dev->flags & IFF_PROMISC),
4431                                 (old_flags & IFF_PROMISC),
4432                                 audit_get_loginuid(current),
4433                                 uid, gid,
4434                                 audit_get_sessionid(current));
4435                 }
4436
4437                 dev_change_rx_flags(dev, IFF_PROMISC);
4438         }
4439         return 0;
4440 }
4441
4442 /**
4443  *      dev_set_promiscuity     - update promiscuity count on a device
4444  *      @dev: device
4445  *      @inc: modifier
4446  *
4447  *      Add or remove promiscuity from a device. While the count in the device
4448  *      remains above zero the interface remains promiscuous. Once it hits zero
4449  *      the device reverts back to normal filtering operation. A negative inc
4450  *      value is used to drop promiscuity on the device.
4451  *      Return 0 if successful or a negative errno code on error.
4452  */
4453 int dev_set_promiscuity(struct net_device *dev, int inc)
4454 {
4455         unsigned short old_flags = dev->flags;
4456         int err;
4457
4458         err = __dev_set_promiscuity(dev, inc);
4459         if (err < 0)
4460                 return err;
4461         if (dev->flags != old_flags)
4462                 dev_set_rx_mode(dev);
4463         return err;
4464 }
4465 EXPORT_SYMBOL(dev_set_promiscuity);
4466
4467 /**
4468  *      dev_set_allmulti        - update allmulti count on a device
4469  *      @dev: device
4470  *      @inc: modifier
4471  *
4472  *      Add or remove reception of all multicast frames to a device. While the
4473  *      count in the device remains above zero the interface remains listening
4474  *      to all interfaces. Once it hits zero the device reverts back to normal
4475  *      filtering operation. A negative @inc value is used to drop the counter
4476  *      when releasing a resource needing all multicasts.
4477  *      Return 0 if successful or a negative errno code on error.
4478  */
4479
4480 int dev_set_allmulti(struct net_device *dev, int inc)
4481 {
4482         unsigned short old_flags = dev->flags;
4483
4484         ASSERT_RTNL();
4485
4486         dev->flags |= IFF_ALLMULTI;
4487         dev->allmulti += inc;
4488         if (dev->allmulti == 0) {
4489                 /*
4490                  * Avoid overflow.
4491                  * If inc causes overflow, untouch allmulti and return error.
4492                  */
4493                 if (inc < 0)
4494                         dev->flags &= ~IFF_ALLMULTI;
4495                 else {
4496                         dev->allmulti -= inc;
4497                         printk(KERN_WARNING "%s: allmulti touches roof, "
4498                                 "set allmulti failed, allmulti feature of "
4499                                 "device might be broken.\n", dev->name);
4500                         return -EOVERFLOW;
4501                 }
4502         }
4503         if (dev->flags ^ old_flags) {
4504                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4505                 dev_set_rx_mode(dev);
4506         }
4507         return 0;
4508 }
4509 EXPORT_SYMBOL(dev_set_allmulti);
4510
4511 /*
4512  *      Upload unicast and multicast address lists to device and
4513  *      configure RX filtering. When the device doesn't support unicast
4514  *      filtering it is put in promiscuous mode while unicast addresses
4515  *      are present.
4516  */
4517 void __dev_set_rx_mode(struct net_device *dev)
4518 {
4519         const struct net_device_ops *ops = dev->netdev_ops;
4520
4521         /* dev_open will call this function so the list will stay sane. */
4522         if (!(dev->flags&IFF_UP))
4523                 return;
4524
4525         if (!netif_device_present(dev))
4526                 return;
4527
4528         if (ops->ndo_set_rx_mode)
4529                 ops->ndo_set_rx_mode(dev);
4530         else {
4531                 /* Unicast addresses changes may only happen under the rtnl,
4532                  * therefore calling __dev_set_promiscuity here is safe.
4533                  */
4534                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4535                         __dev_set_promiscuity(dev, 1);
4536                         dev->uc_promisc = 1;
4537                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4538                         __dev_set_promiscuity(dev, -1);
4539                         dev->uc_promisc = 0;
4540                 }
4541
4542                 if (ops->ndo_set_multicast_list)
4543                         ops->ndo_set_multicast_list(dev);
4544         }
4545 }
4546
/*
 *      Locked wrapper: runs __dev_set_rx_mode() with the device address
 *      lock held (BH-disabling variant).
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);

        __dev_set_rx_mode(dev);

        netif_addr_unlock_bh(dev);
}
4553
4554 /**
4555  *      dev_get_flags - get flags reported to userspace
4556  *      @dev: device
4557  *
4558  *      Get the combination of flag bits exported through APIs to userspace.
4559  */
4560 unsigned dev_get_flags(const struct net_device *dev)
4561 {
4562         unsigned flags;
4563
4564         flags = (dev->flags & ~(IFF_PROMISC |
4565                                 IFF_ALLMULTI |
4566                                 IFF_RUNNING |
4567                                 IFF_LOWER_UP |
4568                                 IFF_DORMANT)) |
4569                 (dev->gflags & (IFF_PROMISC |
4570                                 IFF_ALLMULTI));
4571
4572         if (netif_running(dev)) {
4573                 if (netif_oper_up(dev))
4574                         flags |= IFF_RUNNING;
4575                 if (netif_carrier_ok(dev))
4576                         flags |= IFF_LOWER_UP;
4577                 if (netif_dormant(dev))
4578                         flags |= IFF_DORMANT;
4579         }
4580
4581         return flags;
4582 }
4583 EXPORT_SYMBOL(dev_get_flags);
4584
4585 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4586 {
4587         int old_flags = dev->flags;
4588         int ret;
4589
4590         ASSERT_RTNL();
4591
4592         /*
4593          *      Set the flags on our device.
4594          */
4595
4596         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4597                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4598                                IFF_AUTOMEDIA)) |
4599                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4600                                     IFF_ALLMULTI));
4601
4602         /*
4603          *      Load in the correct multicast list now the flags have changed.
4604          */
4605
4606         if ((old_flags ^ flags) & IFF_MULTICAST)
4607                 dev_change_rx_flags(dev, IFF_MULTICAST);
4608
4609         dev_set_rx_mode(dev);
4610
4611         /*
4612          *      Have we downed the interface. We handle IFF_UP ourselves
4613          *      according to user attempts to set it, rather than blindly
4614          *      setting it.
4615          */
4616
4617         ret = 0;
4618         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4619                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4620
4621                 if (!ret)
4622                         dev_set_rx_mode(dev);
4623         }
4624
4625         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4626                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4627
4628                 dev->gflags ^= IFF_PROMISC;
4629                 dev_set_promiscuity(dev, inc);
4630         }
4631
4632         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4633            is important. Some (broken) drivers set IFF_PROMISC, when
4634            IFF_ALLMULTI is requested not asking us and not reporting.
4635          */
4636         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4637                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4638
4639                 dev->gflags ^= IFF_ALLMULTI;
4640                 dev_set_allmulti(dev, inc);
4641         }
4642
4643         return ret;
4644 }
4645
4646 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4647 {
4648         unsigned int changes = dev->flags ^ old_flags;
4649
4650         if (changes & IFF_UP) {
4651                 if (dev->flags & IFF_UP)
4652                         call_netdevice_notifiers(NETDEV_UP, dev);
4653                 else
4654                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4655         }
4656
4657         if (dev->flags & IFF_UP &&
4658             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4659                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4660 }
4661
4662 /**
4663  *      dev_change_flags - change device settings
4664  *      @dev: device
4665  *      @flags: device state flags
4666  *
4667  *      Change settings on device based state flags. The flags are
4668  *      in the userspace exported format.
4669  */
4670 int dev_change_flags(struct net_device *dev, unsigned flags)
4671 {
4672         int ret, changes;
4673         int old_flags = dev->flags;
4674
4675         ret = __dev_change_flags(dev, flags);
4676         if (ret < 0)
4677                 return ret;
4678
4679         changes = old_flags ^ dev->flags;
4680         if (changes)
4681                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4682
4683         __dev_notify_flags(dev, old_flags);
4684         return ret;
4685 }
4686 EXPORT_SYMBOL(dev_change_flags);
4687
4688 /**
4689  *      dev_set_mtu - Change maximum transfer unit
4690  *      @dev: device
4691  *      @new_mtu: new transfer unit
4692  *
4693  *      Change the maximum transfer size of the network device.
4694  */
4695 int dev_set_mtu(struct net_device *dev, int new_mtu)
4696 {
4697         const struct net_device_ops *ops = dev->netdev_ops;
4698         int err;
4699
4700         if (new_mtu == dev->mtu)
4701                 return 0;
4702
4703         /*      MTU must be positive.    */
4704         if (new_mtu < 0)
4705                 return -EINVAL;
4706
4707         if (!netif_device_present(dev))
4708                 return -ENODEV;
4709
4710         err = 0;
4711         if (ops->ndo_change_mtu)
4712                 err = ops->ndo_change_mtu(dev, new_mtu);
4713         else
4714                 dev->mtu = new_mtu;
4715
4716         if (!err && dev->flags & IFF_UP)
4717                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4718         return err;
4719 }
4720 EXPORT_SYMBOL(dev_set_mtu);
4721
4722 /**
4723  *      dev_set_group - Change group this device belongs to
4724  *      @dev: device
4725  *      @new_group: group this device should belong to
4726  */
4727 void dev_set_group(struct net_device *dev, int new_group)
4728 {
4729         dev->group = new_group;
4730 }
4731 EXPORT_SYMBOL(dev_set_group);
4732
4733 /**
4734  *      dev_set_mac_address - Change Media Access Control Address
4735  *      @dev: device
4736  *      @sa: new address
4737  *
4738  *      Change the hardware (MAC) address of the device
4739  */
4740 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4741 {
4742         const struct net_device_ops *ops = dev->netdev_ops;
4743         int err;
4744
4745         if (!ops->ndo_set_mac_address)
4746                 return -EOPNOTSUPP;
4747         if (sa->sa_family != dev->type)
4748                 return -EINVAL;
4749         if (!netif_device_present(dev))
4750                 return -ENODEV;
4751         err = ops->ndo_set_mac_address(dev, sa);
4752         if (!err)
4753                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4754         return err;
4755 }
4756 EXPORT_SYMBOL(dev_set_mac_address);
4757
4758 /*
4759  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4760  */
4761 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4762 {
4763         int err;
4764         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4765
4766         if (!dev)
4767                 return -ENODEV;
4768
4769         switch (cmd) {
4770         case SIOCGIFFLAGS:      /* Get interface flags */
4771                 ifr->ifr_flags = (short) dev_get_flags(dev);
4772                 return 0;
4773
4774         case SIOCGIFMETRIC:     /* Get the metric on the interface
4775                                    (currently unused) */
4776                 ifr->ifr_metric = 0;
4777                 return 0;
4778
4779         case SIOCGIFMTU:        /* Get the MTU of a device */
4780                 ifr->ifr_mtu = dev->mtu;
4781                 return 0;
4782
4783         case SIOCGIFHWADDR:
4784                 if (!dev->addr_len)
4785                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4786                 else
4787                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4788                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4789                 ifr->ifr_hwaddr.sa_family = dev->type;
4790                 return 0;
4791
4792         case SIOCGIFSLAVE:
4793                 err = -EINVAL;
4794                 break;
4795
4796         case SIOCGIFMAP:
4797                 ifr->ifr_map.mem_start = dev->mem_start;
4798                 ifr->ifr_map.mem_end   = dev->mem_end;
4799                 ifr->ifr_map.base_addr = dev->base_addr;
4800                 ifr->ifr_map.irq       = dev->irq;
4801                 ifr->ifr_map.dma       = dev->dma;
4802                 ifr->ifr_map.port      = dev->if_port;
4803                 return 0;
4804
4805         case SIOCGIFINDEX:
4806                 ifr->ifr_ifindex = dev->ifindex;
4807                 return 0;
4808
4809         case SIOCGIFTXQLEN:
4810                 ifr->ifr_qlen = dev->tx_queue_len;
4811                 return 0;
4812
4813         default:
4814                 /* dev_ioctl() should ensure this case
4815                  * is never reached
4816                  */
4817                 WARN_ON(1);
4818                 err = -EINVAL;
4819                 break;
4820
4821         }
4822         return err;
4823 }
4824
4825 /*
4826  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4827  */
4828 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4829 {
4830         int err;
4831         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4832         const struct net_device_ops *ops;
4833
4834         if (!dev)
4835                 return -ENODEV;
4836
4837         ops = dev->netdev_ops;
4838
4839         switch (cmd) {
4840         case SIOCSIFFLAGS:      /* Set interface flags */
4841                 return dev_change_flags(dev, ifr->ifr_flags);
4842
4843         case SIOCSIFMETRIC:     /* Set the metric on the interface
4844                                    (currently unused) */
4845                 return -EOPNOTSUPP;
4846
4847         case SIOCSIFMTU:        /* Set the MTU of a device */
4848                 return dev_set_mtu(dev, ifr->ifr_mtu);
4849
4850         case SIOCSIFHWADDR:
4851                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4852
4853         case SIOCSIFHWBROADCAST:
4854                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4855                         return -EINVAL;
4856                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4857                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4858                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4859                 return 0;
4860
4861         case SIOCSIFMAP:
4862                 if (ops->ndo_set_config) {
4863                         if (!netif_device_present(dev))
4864                                 return -ENODEV;
4865                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4866                 }
4867                 return -EOPNOTSUPP;
4868
4869         case SIOCADDMULTI:
4870                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4871                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4872                         return -EINVAL;
4873                 if (!netif_device_present(dev))
4874                         return -ENODEV;
4875                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4876
4877         case SIOCDELMULTI:
4878                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4879                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4880                         return -EINVAL;
4881                 if (!netif_device_present(dev))
4882                         return -ENODEV;
4883                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4884
4885         case SIOCSIFTXQLEN:
4886                 if (ifr->ifr_qlen < 0)
4887                         return -EINVAL;
4888                 dev->tx_queue_len = ifr->ifr_qlen;
4889                 return 0;
4890
4891         case SIOCSIFNAME:
4892                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4893                 return dev_change_name(dev, ifr->ifr_newname);
4894
4895         /*
4896          *      Unknown or private ioctl
4897          */
4898         default:
4899                 if ((cmd >= SIOCDEVPRIVATE &&
4900                     cmd <= SIOCDEVPRIVATE + 15) ||
4901                     cmd == SIOCBONDENSLAVE ||
4902                     cmd == SIOCBONDRELEASE ||
4903                     cmd == SIOCBONDSETHWADDR ||
4904                     cmd == SIOCBONDSLAVEINFOQUERY ||
4905                     cmd == SIOCBONDINFOQUERY ||
4906                     cmd == SIOCBONDCHANGEACTIVE ||
4907                     cmd == SIOCGMIIPHY ||
4908                     cmd == SIOCGMIIREG ||
4909                     cmd == SIOCSMIIREG ||
4910                     cmd == SIOCBRADDIF ||
4911                     cmd == SIOCBRDELIF ||
4912                     cmd == SIOCSHWTSTAMP ||
4913                     cmd == SIOCWANDEV) {
4914                         err = -EOPNOTSUPP;
4915                         if (ops->ndo_do_ioctl) {
4916                                 if (netif_device_present(dev))
4917                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4918                                 else
4919                                         err = -ENODEV;
4920                         }
4921                 } else
4922                         err = -EINVAL;
4923
4924         }
4925         return err;
4926 }
4927
4928 /*
4929  *      This function handles all "interface"-type I/O control requests. The actual
4930  *      'doing' part of this is dev_ifsioc above.
4931  */
4932
4933 /**
4934  *      dev_ioctl       -       network device ioctl
4935  *      @net: the applicable net namespace
4936  *      @cmd: command to issue
4937  *      @arg: pointer to a struct ifreq in user space
4938  *
4939  *      Issue ioctl functions to devices. This is normally called by the
4940  *      user space syscall interfaces but can sometimes be useful for
4941  *      other purposes. The return value is the return from the syscall if
4942  *      positive or a negative errno code on error.
4943  */
4944
4945 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4946 {
4947         struct ifreq ifr;
4948         int ret;
4949         char *colon;
4950
4951         /* One special case: SIOCGIFCONF takes ifconf argument
4952            and requires shared lock, because it sleeps writing
4953            to user space.
4954          */
4955
4956         if (cmd == SIOCGIFCONF) {
4957                 rtnl_lock();
4958                 ret = dev_ifconf(net, (char __user *) arg);
4959                 rtnl_unlock();
4960                 return ret;
4961         }
4962         if (cmd == SIOCGIFNAME)
4963                 return dev_ifname(net, (struct ifreq __user *)arg);
4964
4965         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4966                 return -EFAULT;
4967
4968         ifr.ifr_name[IFNAMSIZ-1] = 0;
4969
4970         colon = strchr(ifr.ifr_name, ':');
4971         if (colon)
4972                 *colon = 0;
4973
4974         /*
4975          *      See which interface the caller is talking about.
4976          */
4977
4978         switch (cmd) {
4979         /*
4980          *      These ioctl calls:
4981          *      - can be done by all.
4982          *      - atomic and do not require locking.
4983          *      - return a value
4984          */
4985         case SIOCGIFFLAGS:
4986         case SIOCGIFMETRIC:
4987         case SIOCGIFMTU:
4988         case SIOCGIFHWADDR:
4989         case SIOCGIFSLAVE:
4990         case SIOCGIFMAP:
4991         case SIOCGIFINDEX:
4992         case SIOCGIFTXQLEN:
4993                 dev_load(net, ifr.ifr_name);
4994                 rcu_read_lock();
4995                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4996                 rcu_read_unlock();
4997                 if (!ret) {
4998                         if (colon)
4999                                 *colon = ':';
5000                         if (copy_to_user(arg, &ifr,
5001                                          sizeof(struct ifreq)))
5002                                 ret = -EFAULT;
5003                 }
5004                 return ret;
5005
5006         case SIOCETHTOOL:
5007                 dev_load(net, ifr.ifr_name);
5008                 rtnl_lock();
5009                 ret = dev_ethtool(net, &ifr);
5010                 rtnl_unlock();
5011                 if (!ret) {
5012                         if (colon)
5013                                 *colon = ':';
5014                         if (copy_to_user(arg, &ifr,
5015                                          sizeof(struct ifreq)))
5016                                 ret = -EFAULT;
5017                 }
5018                 return ret;
5019
5020         /*
5021          *      These ioctl calls:
5022          *      - require superuser power.
5023          *      - require strict serialization.
5024          *      - return a value
5025          */
5026         case SIOCGMIIPHY:
5027         case SIOCGMIIREG:
5028         case SIOCSIFNAME:
5029                 if (!capable(CAP_NET_ADMIN))
5030                         return -EPERM;
5031                 dev_load(net, ifr.ifr_name);
5032                 rtnl_lock();
5033                 ret = dev_ifsioc(net, &ifr, cmd);
5034                 rtnl_unlock();
5035                 if (!ret) {
5036                         if (colon)
5037                                 *colon = ':';
5038                         if (copy_to_user(arg, &ifr,
5039                                          sizeof(struct ifreq)))
5040                                 ret = -EFAULT;
5041                 }
5042                 return ret;
5043
5044         /*
5045          *      These ioctl calls:
5046          *      - require superuser power.
5047          *      - require strict serialization.
5048          *      - do not return a value
5049          */
5050         case SIOCSIFFLAGS:
5051         case SIOCSIFMETRIC:
5052         case SIOCSIFMTU:
5053         case SIOCSIFMAP:
5054         case SIOCSIFHWADDR:
5055         case SIOCSIFSLAVE:
5056         case SIOCADDMULTI:
5057         case SIOCDELMULTI:
5058         case SIOCSIFHWBROADCAST:
5059         case SIOCSIFTXQLEN:
5060         case SIOCSMIIREG:
5061         case SIOCBONDENSLAVE:
5062         case SIOCBONDRELEASE:
5063         case SIOCBONDSETHWADDR:
5064         case SIOCBONDCHANGEACTIVE:
5065         case SIOCBRADDIF:
5066         case SIOCBRDELIF:
5067         case SIOCSHWTSTAMP:
5068                 if (!capable(CAP_NET_ADMIN))
5069                         return -EPERM;
5070                 /* fall through */
5071         case SIOCBONDSLAVEINFOQUERY:
5072         case SIOCBONDINFOQUERY:
5073                 dev_load(net, ifr.ifr_name);
5074                 rtnl_lock();
5075                 ret = dev_ifsioc(net, &ifr, cmd);
5076                 rtnl_unlock();
5077                 return ret;
5078
5079         case SIOCGIFMEM:
5080                 /* Get the per device memory space. We can add this but
5081                  * currently do not support it */
5082         case SIOCSIFMEM:
5083                 /* Set the per device memory buffer space.
5084                  * Not applicable in our case */
5085         case SIOCSIFLINK:
5086                 return -EINVAL;
5087
5088         /*
5089          *      Unknown or private ioctl.
5090          */
5091         default:
5092                 if (cmd == SIOCWANDEV ||
5093                     (cmd >= SIOCDEVPRIVATE &&
5094                      cmd <= SIOCDEVPRIVATE + 15)) {
5095                         dev_load(net, ifr.ifr_name);
5096                         rtnl_lock();
5097                         ret = dev_ifsioc(net, &ifr, cmd);
5098                         rtnl_unlock();
5099                         if (!ret && copy_to_user(arg, &ifr,
5100                                                  sizeof(struct ifreq)))
5101                                 ret = -EFAULT;
5102                         return ret;
5103                 }
5104                 /* Take care of Wireless Extensions */
5105                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5106                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5107                 return -EINVAL;
5108         }
5109 }
5110
5111
/**
 *      dev_new_index   -       allocate an ifindex
 *      @net: the applicable net namespace
 *
 *      Returns a suitable unique value for a new device interface
 *      number.  The caller must hold the rtnl semaphore or the
 *      dev_base_lock to be sure it remains unique.
 *
 *      Candidates come from a function-static counter that is advanced on
 *      every attempt and reset to 1 whenever it becomes non-positive; the
 *      first value for which __dev_get_by_index() finds no device in @net
 *      is returned.
 */
static int dev_new_index(struct net *net)
{
        static int ifindex;

        do {
                if (++ifindex <= 0)
                        ifindex = 1;
        } while (__dev_get_by_index(net, ifindex));

        return ifindex;
}
5130
5131 /* Delayed registration/unregisteration */
5132 static LIST_HEAD(net_todo_list);
5133
5134 static void net_set_todo(struct net_device *dev)
5135 {
5136         list_add_tail(&dev->todo_list, &net_todo_list);
5137 }
5138
5139 static void rollback_registered_many(struct list_head *head)
5140 {
5141         struct net_device *dev, *tmp;
5142
5143         BUG_ON(dev_boot_phase);
5144         ASSERT_RTNL();
5145
5146         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5147                 /* Some devices call without registering
5148                  * for initialization unwind. Remove those
5149                  * devices and proceed with the remaining.
5150                  */
5151                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5152                         pr_debug("unregister_netdevice: device %s/%p never "
5153                                  "was registered\n", dev->name, dev);
5154
5155                         WARN_ON(1);
5156                         list_del(&dev->unreg_list);
5157                         continue;
5158                 }
5159
5160                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5161         }
5162
5163         /* If device is running, close it first. */
5164         dev_close_many(head);
5165
5166         list_for_each_entry(dev, head, unreg_list) {
5167                 /* And unlink it from device chain. */
5168                 unlist_netdevice(dev);
5169
5170                 dev->reg_state = NETREG_UNREGISTERING;
5171         }
5172
5173         synchronize_net();
5174
5175         list_for_each_entry(dev, head, unreg_list) {
5176                 /* Shutdown queueing discipline. */
5177                 dev_shutdown(dev);
5178
5179
5180                 /* Notify protocols, that we are about to destroy
5181                    this device. They should clean all the things.
5182                 */
5183                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5184
5185                 if (!dev->rtnl_link_ops ||
5186                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5187                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5188
5189                 /*
5190                  *      Flush the unicast and multicast chains
5191                  */
5192                 dev_uc_flush(dev);
5193                 dev_mc_flush(dev);
5194
5195                 if (dev->netdev_ops->ndo_uninit)
5196                         dev->netdev_ops->ndo_uninit(dev);
5197
5198                 /* Notifier chain MUST detach us from master device. */
5199                 WARN_ON(dev->master);
5200
5201                 /* Remove entries from kobject tree */
5202                 netdev_unregister_kobject(dev);
5203         }
5204
5205         /* Process any work delayed until the end of the batch */
5206         dev = list_first_entry(head, struct net_device, unreg_list);
5207         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5208
5209         rcu_barrier();
5210
5211         list_for_each_entry(dev, head, unreg_list)
5212                 dev_put(dev);
5213 }
5214
5215 static void rollback_registered(struct net_device *dev)
5216 {
5217         LIST_HEAD(single);
5218
5219         list_add(&dev->unreg_list, &single);
5220         rollback_registered_many(&single);
5221 }
5222
5223 u32 netdev_fix_features(struct net_device *dev, u32 features)
5224 {
5225         /* Fix illegal checksum combinations */
5226         if ((features & NETIF_F_HW_CSUM) &&
5227             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5228                 netdev_info(dev, "mixed HW and IP checksum settings.\n");
5229                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5230         }
5231
5232         if ((features & NETIF_F_NO_CSUM) &&
5233             (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5234                 netdev_info(dev, "mixed no checksumming and other settings.\n");
5235                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5236         }
5237
5238         /* Fix illegal SG+CSUM combinations. */
5239         if ((features & NETIF_F_SG) &&
5240             !(features & NETIF_F_ALL_CSUM)) {
5241                 netdev_info(dev,
5242                             "Dropping NETIF_F_SG since no checksum feature.\n");
5243                 features &= ~NETIF_F_SG;
5244         }
5245
5246         /* TSO requires that SG is present as well. */
5247         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5248                 netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n");
5249                 features &= ~NETIF_F_TSO;
5250         }
5251
5252         /* UFO needs SG and checksumming */
5253         if (features & NETIF_F_UFO) {
5254                 /* maybe split UFO into V4 and V6? */
5255                 if (!((features & NETIF_F_GEN_CSUM) ||
5256                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5257                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5258                         netdev_info(dev,
5259                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5260                         features &= ~NETIF_F_UFO;
5261                 }
5262
5263                 if (!(features & NETIF_F_SG)) {
5264                         netdev_info(dev,
5265                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5266                         features &= ~NETIF_F_UFO;
5267                 }
5268         }
5269
5270         return features;
5271 }
5272 EXPORT_SYMBOL(netdev_fix_features);
5273
5274 /**
5275  *      netif_stacked_transfer_operstate -      transfer operstate
5276  *      @rootdev: the root or lower level device to transfer state from
5277  *      @dev: the device to transfer operstate to
5278  *
5279  *      Transfer operational state from root to device. This is normally
5280  *      called when a stacking relationship exists between the root
5281  *      device and the device(a leaf device).
5282  */
5283 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5284                                         struct net_device *dev)
5285 {
5286         if (rootdev->operstate == IF_OPER_DORMANT)
5287                 netif_dormant_on(dev);
5288         else
5289                 netif_dormant_off(dev);
5290
5291         if (netif_carrier_ok(rootdev)) {
5292                 if (!netif_carrier_ok(dev))
5293                         netif_carrier_on(dev);
5294         } else {
5295                 if (netif_carrier_ok(dev))
5296                         netif_carrier_off(dev);
5297         }
5298 }
5299 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5300
#ifdef CONFIG_RPS
/*
 * Allocate dev->num_rx_queues zeroed RX queue structures, store the array
 * in dev->_rx and point every queue back at @dev.
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	const unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(nqueues < 1);

	queues = kcalloc(nqueues, sizeof(*queues), GFP_KERNEL);
	if (!queues) {
		pr_err("netdev: Unable to allocate %u rx queues.\n", nqueues);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (q = queues; q < queues + nqueues; q++)
		q->dev = dev;

	return 0;
}
#endif
5321
5322 static void netdev_init_one_queue(struct net_device *dev,
5323                                   struct netdev_queue *queue, void *_unused)
5324 {
5325         /* Initialize queue lock */
5326         spin_lock_init(&queue->_xmit_lock);
5327         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5328         queue->xmit_lock_owner = -1;
5329         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5330         queue->dev = dev;
5331 }
5332
5333 static int netif_alloc_netdev_queues(struct net_device *dev)
5334 {
5335         unsigned int count = dev->num_tx_queues;
5336         struct netdev_queue *tx;
5337
5338         BUG_ON(count < 1);
5339
5340         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5341         if (!tx) {
5342                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5343                        count);
5344                 return -ENOMEM;
5345         }
5346         dev->_tx = tx;
5347
5348         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5349         spin_lock_init(&dev->tx_global_lock);
5350
5351         return 0;
5352 }
5353
5354 /**
5355  *      register_netdevice      - register a network device
5356  *      @dev: device to register
5357  *
5358  *      Take a completed network device structure and add it to the kernel
5359  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5360  *      chain. 0 is returned on success. A negative errno code is returned
5361  *      on a failure to set up the device, or if the name is a duplicate.
5362  *
5363  *      Callers must hold the rtnl semaphore. You may want
5364  *      register_netdev() instead of this.
5365  *
5366  *      BUGS:
5367  *      The locking appears insufficient to guarantee two parallel registers
5368  *      will not get the same name.
5369  */
5370
5371 int register_netdevice(struct net_device *dev)
5372 {
5373         int ret;
5374         struct net *net = dev_net(dev);
5375
5376         BUG_ON(dev_boot_phase);
5377         ASSERT_RTNL();
5378
5379         might_sleep();
5380
5381         /* When net_device's are persistent, this will be fatal. */
5382         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5383         BUG_ON(!net);
5384
5385         spin_lock_init(&dev->addr_list_lock);
5386         netdev_set_addr_lockdep_class(dev);
5387
5388         dev->iflink = -1;
5389
5390         /* Init, if this function is available */
5391         if (dev->netdev_ops->ndo_init) {
5392                 ret = dev->netdev_ops->ndo_init(dev);
5393                 if (ret) {
5394                         if (ret > 0)
5395                                 ret = -EIO;
5396                         goto out;
5397                 }
5398         }
5399
5400         ret = dev_get_valid_name(dev, dev->name, 0);
5401         if (ret)
5402                 goto err_uninit;
5403
5404         dev->ifindex = dev_new_index(net);
5405         if (dev->iflink == -1)
5406                 dev->iflink = dev->ifindex;
5407
5408         dev->features = netdev_fix_features(dev, dev->features);
5409
5410         /* Enable software GSO if SG is supported. */
5411         if (dev->features & NETIF_F_SG)
5412                 dev->features |= NETIF_F_GSO;
5413
5414         /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5415          * vlan_dev_init() will do the dev->features check, so these features
5416          * are enabled only if supported by underlying device.
5417          */
5418         dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5419
5420         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5421         ret = notifier_to_errno(ret);
5422         if (ret)
5423                 goto err_uninit;
5424
5425         ret = netdev_register_kobject(dev);
5426         if (ret)
5427                 goto err_uninit;
5428         dev->reg_state = NETREG_REGISTERED;
5429
5430         /*
5431          *      Default initial state at registry is that the
5432          *      device is present.
5433          */
5434
5435         set_bit(__LINK_STATE_PRESENT, &dev->state);
5436
5437         dev_init_scheduler(dev);
5438         dev_hold(dev);
5439         list_netdevice(dev);
5440
5441         /* Notify protocols, that a new device appeared. */
5442         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5443         ret = notifier_to_errno(ret);
5444         if (ret) {
5445                 rollback_registered(dev);
5446                 dev->reg_state = NETREG_UNREGISTERED;
5447         }
5448         /*
5449          *      Prevent userspace races by waiting until the network
5450          *      device is fully setup before sending notifications.
5451          */
5452         if (!dev->rtnl_link_ops ||
5453             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5454                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5455
5456 out:
5457         return ret;
5458
5459 err_uninit:
5460         if (dev->netdev_ops->ndo_uninit)
5461                 dev->netdev_ops->ndo_uninit(dev);
5462         goto out;
5463 }
5464 EXPORT_SYMBOL(register_netdevice);
5465
5466 /**
5467  *      init_dummy_netdev       - init a dummy network device for NAPI
5468  *      @dev: device to init
5469  *
5470  *      This takes a network device structure and initialize the minimum
5471  *      amount of fields so it can be used to schedule NAPI polls without
5472  *      registering a full blown interface. This is to be used by drivers
5473  *      that need to tie several hardware interfaces to a single NAPI
5474  *      poll scheduler due to HW limitations.
5475  */
5476 int init_dummy_netdev(struct net_device *dev)
5477 {
5478         /* Clear everything. Note we don't initialize spinlocks
5479          * are they aren't supposed to be taken by any of the
5480          * NAPI code and this dummy netdev is supposed to be
5481          * only ever used for NAPI polls
5482          */
5483         memset(dev, 0, sizeof(struct net_device));
5484
5485         /* make sure we BUG if trying to hit standard
5486          * register/unregister code path
5487          */
5488         dev->reg_state = NETREG_DUMMY;
5489
5490         /* NAPI wants this */
5491         INIT_LIST_HEAD(&dev->napi_list);
5492
5493         /* a dummy interface is started by default */
5494         set_bit(__LINK_STATE_PRESENT, &dev->state);
5495         set_bit(__LINK_STATE_START, &dev->state);
5496
5497         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5498          * because users of this 'device' dont need to change
5499          * its refcount.
5500          */
5501
5502         return 0;
5503 }
5504 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5505
5506
5507 /**
5508  *      register_netdev - register a network device
5509  *      @dev: device to register
5510  *
5511  *      Take a completed network device structure and add it to the kernel
5512  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5513  *      chain. 0 is returned on success. A negative errno code is returned
5514  *      on a failure to set up the device, or if the name is a duplicate.
5515  *
5516  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5517  *      and expands the device name if you passed a format string to
5518  *      alloc_netdev.
5519  */
5520 int register_netdev(struct net_device *dev)
5521 {
5522         int err;
5523
5524         rtnl_lock();
5525
5526         /*
5527          * If the name is a format string the caller wants us to do a
5528          * name allocation.
5529          */
5530         if (strchr(dev->name, '%')) {
5531                 err = dev_alloc_name(dev, dev->name);
5532                 if (err < 0)
5533                         goto out;
5534         }
5535
5536         err = register_netdevice(dev);
5537 out:
5538         rtnl_unlock();
5539         return err;
5540 }
5541 EXPORT_SYMBOL(register_netdev);
5542
5543 int netdev_refcnt_read(const struct net_device *dev)
5544 {
5545         int i, refcnt = 0;
5546
5547         for_each_possible_cpu(i)
5548                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5549         return refcnt;
5550 }
5551 EXPORT_SYMBOL(netdev_refcnt_read);
5552
5553 /*
5554  * netdev_wait_allrefs - wait until all references are gone.
5555  *
5556  * This is called when unregistering network devices.
5557  *
5558  * Any protocol or device that holds a reference should register
5559  * for netdevice notification, and cleanup and put back the
5560  * reference if they receive an UNREGISTER event.
5561  * We can get stuck here if buggy protocols don't correctly
5562  * call dev_put.
5563  */
5564 static void netdev_wait_allrefs(struct net_device *dev)
5565 {
5566         unsigned long rebroadcast_time, warning_time;
5567         int refcnt;
5568
5569         linkwatch_forget_dev(dev);
5570
5571         rebroadcast_time = warning_time = jiffies;
5572         refcnt = netdev_refcnt_read(dev);
5573
5574         while (refcnt != 0) {
5575                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5576                         rtnl_lock();
5577
5578                         /* Rebroadcast unregister notification */
5579                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5580                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5581                          * should have already handle it the first time */
5582
5583                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5584                                      &dev->state)) {
5585                                 /* We must not have linkwatch events
5586                                  * pending on unregister. If this
5587                                  * happens, we simply run the queue
5588                                  * unscheduled, resulting in a noop
5589                                  * for this device.
5590                                  */
5591                                 linkwatch_run_queue();
5592                         }
5593
5594                         __rtnl_unlock();
5595
5596                         rebroadcast_time = jiffies;
5597                 }
5598
5599                 msleep(250);
5600
5601                 refcnt = netdev_refcnt_read(dev);
5602
5603                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5604                         printk(KERN_EMERG "unregister_netdevice: "
5605                                "waiting for %s to become free. Usage "
5606                                "count = %d\n",
5607                                dev->name, refcnt);
5608                         warning_time = jiffies;
5609                 }
5610         }
5611 }
5612
5613 /* The sequence is:
5614  *
5615  *      rtnl_lock();
5616  *      ...
5617  *      register_netdevice(x1);
5618  *      register_netdevice(x2);
5619  *      ...
5620  *      unregister_netdevice(y1);
5621  *      unregister_netdevice(y2);
5622  *      ...
5623  *      rtnl_unlock();
5624  *      free_netdev(y1);
5625  *      free_netdev(y2);
5626  *
5627  * We are invoked by rtnl_unlock().
5628  * This allows us to deal with problems:
5629  * 1) We can delete sysfs objects which invoke hotplug
5630  *    without deadlocking with linkwatch via keventd.
5631  * 2) Since we run with the RTNL semaphore not held, we can sleep
5632  *    safely in order to wait for the netdev refcnt to drop to zero.
5633  *
5634  * We must not return until all unregister events added during
5635  * the interval the lock was held have been completed.
5636  */
5637 void netdev_run_todo(void)
5638 {
5639         struct list_head list;
5640
5641         /* Snapshot list, allow later requests */
5642         list_replace_init(&net_todo_list, &list);
5643
5644         __rtnl_unlock();
5645
5646         while (!list_empty(&list)) {
5647                 struct net_device *dev
5648                         = list_first_entry(&list, struct net_device, todo_list);
5649                 list_del(&dev->todo_list);
5650
5651                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5652                         printk(KERN_ERR "network todo '%s' but state %d\n",
5653                                dev->name, dev->reg_state);
5654                         dump_stack();
5655                         continue;
5656                 }
5657
5658                 dev->reg_state = NETREG_UNREGISTERED;
5659
5660                 on_each_cpu(flush_backlog, dev, 1);
5661
5662                 netdev_wait_allrefs(dev);
5663
5664                 /* paranoia */
5665                 BUG_ON(netdev_refcnt_read(dev));
5666                 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5667                 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5668                 WARN_ON(dev->dn_ptr);
5669
5670                 if (dev->destructor)
5671                         dev->destructor(dev);
5672
5673                 /* Free network device */
5674                 kobject_put(&dev->dev.kobj);
5675         }
5676 }
5677
5678 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5679  * fields in the same order, with only the type differing.
5680  */
5681 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5682                                     const struct net_device_stats *netdev_stats)
5683 {
5684 #if BITS_PER_LONG == 64
5685         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5686         memcpy(stats64, netdev_stats, sizeof(*stats64));
5687 #else
5688         size_t i, n = sizeof(*stats64) / sizeof(u64);
5689         const unsigned long *src = (const unsigned long *)netdev_stats;
5690         u64 *dst = (u64 *)stats64;
5691
5692         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5693                      sizeof(*stats64) / sizeof(u64));
5694         for (i = 0; i < n; i++)
5695                 dst[i] = src[i];
5696 #endif
5697 }
5698
5699 /**
5700  *      dev_get_stats   - get network device statistics
5701  *      @dev: device to get statistics from
5702  *      @storage: place to store stats
5703  *
5704  *      Get network statistics from device. Return @storage.
5705  *      The device driver may provide its own method by setting
5706  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5707  *      otherwise the internal statistics structure is used.
5708  */
5709 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5710                                         struct rtnl_link_stats64 *storage)
5711 {
5712         const struct net_device_ops *ops = dev->netdev_ops;
5713
5714         if (ops->ndo_get_stats64) {
5715                 memset(storage, 0, sizeof(*storage));
5716                 ops->ndo_get_stats64(dev, storage);
5717         } else if (ops->ndo_get_stats) {
5718                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5719         } else {
5720                 netdev_stats_to_stats64(storage, &dev->stats);
5721         }
5722         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5723         return storage;
5724 }
5725 EXPORT_SYMBOL(dev_get_stats);
5726
/*
 * dev_ingress_queue_create - return the ingress queue of @dev
 *
 * With CONFIG_NET_CLS_ACT a missing ingress queue is allocated,
 * initialised with the noop qdisc and published in dev->ingress_queue.
 * Without it the current queue is returned as is.
 *
 * Returns the queue, or NULL if there is none and none could be
 * allocated.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			/* publish only once fully set up */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
5744
5745 /**
5746  *      alloc_netdev_mqs - allocate network device
5747  *      @sizeof_priv:   size of private data to allocate space for
5748  *      @name:          device name format string
5749  *      @setup:         callback to initialize device
5750  *      @txqs:          the number of TX subqueues to allocate
5751  *      @rxqs:          the number of RX subqueues to allocate
5752  *
5753  *      Allocates a struct net_device with private data area for driver use
5754  *      and performs basic initialization.  Also allocates subquue structs
5755  *      for each queue on the device.
5756  */
5757 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5758                 void (*setup)(struct net_device *),
5759                 unsigned int txqs, unsigned int rxqs)
5760 {
5761         struct net_device *dev;
5762         size_t alloc_size;
5763         struct net_device *p;
5764
5765         BUG_ON(strlen(name) >= sizeof(dev->name));
5766
5767         if (txqs < 1) {
5768                 pr_err("alloc_netdev: Unable to allocate device "
5769                        "with zero queues.\n");
5770                 return NULL;
5771         }
5772
5773 #ifdef CONFIG_RPS
5774         if (rxqs < 1) {
5775                 pr_err("alloc_netdev: Unable to allocate device "
5776                        "with zero RX queues.\n");
5777                 return NULL;
5778         }
5779 #endif
5780
5781         alloc_size = sizeof(struct net_device);
5782         if (sizeof_priv) {
5783                 /* ensure 32-byte alignment of private area */
5784                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5785                 alloc_size += sizeof_priv;
5786         }
5787         /* ensure 32-byte alignment of whole construct */
5788         alloc_size += NETDEV_ALIGN - 1;
5789
5790         p = kzalloc(alloc_size, GFP_KERNEL);
5791         if (!p) {
5792                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5793                 return NULL;
5794         }
5795
5796         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5797         dev->padded = (char *)dev - (char *)p;
5798
5799         dev->pcpu_refcnt = alloc_percpu(int);
5800         if (!dev->pcpu_refcnt)
5801                 goto free_p;
5802
5803         if (dev_addr_init(dev))
5804                 goto free_pcpu;
5805
5806         dev_mc_init(dev);
5807         dev_uc_init(dev);
5808
5809         dev_net_set(dev, &init_net);
5810
5811         dev->gso_max_size = GSO_MAX_SIZE;
5812
5813         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5814         dev->ethtool_ntuple_list.count = 0;
5815         INIT_LIST_HEAD(&dev->napi_list);
5816         INIT_LIST_HEAD(&dev->unreg_list);
5817         INIT_LIST_HEAD(&dev->link_watch_list);
5818         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5819         setup(dev);
5820
5821         dev->num_tx_queues = txqs;
5822         dev->real_num_tx_queues = txqs;
5823         if (netif_alloc_netdev_queues(dev))
5824                 goto free_all;
5825
5826 #ifdef CONFIG_RPS
5827         dev->num_rx_queues = rxqs;
5828         dev->real_num_rx_queues = rxqs;
5829         if (netif_alloc_rx_queues(dev))
5830                 goto free_all;
5831 #endif
5832
5833         strcpy(dev->name, name);
5834         dev->group = INIT_NETDEV_GROUP;
5835         return dev;
5836
5837 free_all:
5838         free_netdev(dev);
5839         return NULL;
5840
5841 free_pcpu:
5842         free_percpu(dev->pcpu_refcnt);
5843         kfree(dev->_tx);
5844 #ifdef CONFIG_RPS
5845         kfree(dev->_rx);
5846 #endif
5847
5848 free_p:
5849         kfree(p);
5850         return NULL;
5851 }
5852 EXPORT_SYMBOL(alloc_netdev_mqs);
5853
5854 /**
5855  *      free_netdev - free network device
5856  *      @dev: device
5857  *
5858  *      This function does the last stage of destroying an allocated device
5859  *      interface. The reference to the device object is released.
5860  *      If this is the last reference then it will be freed.
5861  */
5862 void free_netdev(struct net_device *dev)
5863 {
5864         struct napi_struct *p, *n;
5865
5866         release_net(dev_net(dev));
5867
5868         kfree(dev->_tx);
5869 #ifdef CONFIG_RPS
5870         kfree(dev->_rx);
5871 #endif
5872
5873         kfree(rcu_dereference_raw(dev->ingress_queue));
5874
5875         /* Flush device addresses */
5876         dev_addr_flush(dev);
5877
5878         /* Clear ethtool n-tuple list */
5879         ethtool_ntuple_flush(dev);
5880
5881         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5882                 netif_napi_del(p);
5883
5884         free_percpu(dev->pcpu_refcnt);
5885         dev->pcpu_refcnt = NULL;
5886
5887         /*  Compatibility with error handling in drivers */
5888         if (dev->reg_state == NETREG_UNINITIALIZED) {
5889                 kfree((char *)dev - dev->padded);
5890                 return;
5891         }
5892
5893         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5894         dev->reg_state = NETREG_RELEASED;
5895
5896         /* will free via device release */
5897         put_device(&dev->dev);
5898 }
5899 EXPORT_SYMBOL(free_netdev);
5900
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits until packets that are currently being received have been
 *	processed. Packets arriving later are not held back.
 *	May sleep.
 */
void synchronize_net(void)
{
	might_sleep();

	/* implemented as a wait on synchronize_rcu() */
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5913
5914 /**
5915  *      unregister_netdevice_queue - remove device from the kernel
5916  *      @dev: device
5917  *      @head: list
5918  *
5919  *      This function shuts down a device interface and removes it
5920  *      from the kernel tables.
5921  *      If head not NULL, device is queued to be unregistered later.
5922  *
5923  *      Callers must hold the rtnl semaphore.  You may want
5924  *      unregister_netdev() instead of this.
5925  */
5926
5927 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5928 {
5929         ASSERT_RTNL();
5930
5931         if (head) {
5932                 list_move_tail(&dev->unreg_list, head);
5933         } else {
5934                 rollback_registered(dev);
5935                 /* Finish processing unregister after unlock */
5936                 net_set_todo(dev);
5937         }
5938 }
5939 EXPORT_SYMBOL(unregister_netdevice_queue);
5940
5941 /**
5942  *      unregister_netdevice_many - unregister many devices
5943  *      @head: list of devices
5944  */
5945 void unregister_netdevice_many(struct list_head *head)
5946 {
5947         struct net_device *dev;
5948
5949         if (!list_empty(head)) {
5950                 rollback_registered_many(head);
5951                 list_for_each_entry(dev, head, unreg_list)
5952                         net_set_todo(dev);
5953         }
5954 }
5955 EXPORT_SYMBOL(unregister_netdevice_many);
5956
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Wrapper for unregister_netdevice() that takes the rtnl semaphore
 *	around the call. In general this is the function to use rather
 *	than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5975
5976 /**
5977  *      dev_change_net_namespace - move device to different nethost namespace
5978  *      @dev: device
5979  *      @net: network namespace
5980  *      @pat: If not NULL name pattern to try if the current device name
5981  *            is already taken in the destination network namespace.
5982  *
5983  *      This function shuts down a device interface and moves it
5984  *      to a new network namespace. On success 0 is returned, on
5985  *      a failure a netagive errno code is returned.
5986  *
5987  *      Callers must hold the rtnl semaphore.
5988  */
5989
5990 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5991 {
5992         int err;
5993
5994         ASSERT_RTNL();
5995
5996         /* Don't allow namespace local devices to be moved. */
5997         err = -EINVAL;
5998         if (dev->features & NETIF_F_NETNS_LOCAL)
5999                 goto out;
6000
6001         /* Ensure the device has been registrered */
6002         err = -EINVAL;
6003         if (dev->reg_state != NETREG_REGISTERED)
6004                 goto out;
6005
6006         /* Get out if there is nothing todo */
6007         err = 0;
6008         if (net_eq(dev_net(dev), net))
6009                 goto out;
6010
6011         /* Pick the destination device name, and ensure
6012          * we can use it in the destination network namespace.
6013          */
6014         err = -EEXIST;
6015         if (__dev_get_by_name(net, dev->name)) {
6016                 /* We get here if we can't use the current device name */
6017                 if (!pat)
6018                         goto out;
6019                 if (dev_get_valid_name(dev, pat, 1))
6020                         goto out;
6021         }
6022
6023         /*
6024          * And now a mini version of register_netdevice unregister_netdevice.
6025          */
6026
6027         /* If device is running close it first. */
6028         dev_close(dev);
6029
6030         /* And unlink it from device chain */
6031         err = -ENODEV;
6032         unlist_netdevice(dev);
6033
6034         synchronize_net();
6035
6036         /* Shutdown queueing discipline. */
6037         dev_shutdown(dev);
6038
6039         /* Notify protocols, that we are about to destroy
6040            this device. They should clean all the things.
6041
6042            Note that dev->reg_state stays at NETREG_REGISTERED.
6043            This is wanted because this way 8021q and macvlan know
6044            the device is just moving and can keep their slaves up.
6045         */
6046         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6047         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6048
6049         /*
6050          *      Flush the unicast and multicast chains
6051          */
6052         dev_uc_flush(dev);
6053         dev_mc_flush(dev);
6054
6055         /* Actually switch the network namespace */
6056         dev_net_set(dev, net);
6057
6058         /* If there is an ifindex conflict assign a new one */
6059         if (__dev_get_by_index(net, dev->ifindex)) {
6060                 int iflink = (dev->iflink == dev->ifindex);
6061                 dev->ifindex = dev_new_index(net);
6062                 if (iflink)
6063                         dev->iflink = dev->ifindex;
6064         }
6065
6066         /* Fixup kobjects */
6067         err = device_rename(&dev->dev, dev->name);
6068         WARN_ON(err);
6069
6070         /* Add the device back in the hashes */
6071         list_netdevice(dev);
6072
6073         /* Notify protocols, that a new device appeared. */
6074         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6075
6076         /*
6077          *      Prevent userspace races by waiting until the network
6078          *      device is fully setup before sending notifications.
6079          */
6080         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6081
6082         synchronize_net();
6083         err = 0;
6084 out:
6085         return err;
6086 }
6087 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6088
6089 static int dev_cpu_callback(struct notifier_block *nfb,
6090                             unsigned long action,
6091                             void *ocpu)
6092 {
6093         struct sk_buff **list_skb;
6094         struct sk_buff *skb;
6095         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6096         struct softnet_data *sd, *oldsd;
6097
6098         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6099                 return NOTIFY_OK;
6100
6101         local_irq_disable();
6102         cpu = smp_processor_id();
6103         sd = &per_cpu(softnet_data, cpu);
6104         oldsd = &per_cpu(softnet_data, oldcpu);
6105
6106         /* Find end of our completion_queue. */
6107         list_skb = &sd->completion_queue;
6108         while (*list_skb)
6109                 list_skb = &(*list_skb)->next;
6110         /* Append completion queue from offline CPU. */
6111         *list_skb = oldsd->completion_queue;
6112         oldsd->completion_queue = NULL;
6113
6114         /* Append output queue from offline CPU. */
6115         if (oldsd->output_queue) {
6116                 *sd->output_queue_tailp = oldsd->output_queue;
6117                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6118                 oldsd->output_queue = NULL;
6119                 oldsd->output_queue_tailp = &oldsd->output_queue;
6120         }
6121
6122         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6123         local_irq_enable();
6124
6125         /* Process offline CPU's input_pkt_queue */
6126         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6127                 netif_rx(skb);
6128                 input_queue_head_incr(oldsd);
6129         }
6130         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6131                 netif_rx(skb);
6132                 input_queue_head_incr(oldsd);
6133         }
6134
6135         return NOTIFY_OK;
6136 }
6137
6138
6139 /**
6140  *      netdev_increment_features - increment feature set by one
6141  *      @all: current feature set
6142  *      @one: new feature set
6143  *      @mask: mask feature set
6144  *
6145  *      Computes a new feature set after adding a device with feature set
6146  *      @one to the master device with current feature set @all.  Will not
6147  *      enable anything that is off in @mask. Returns the new feature set.
6148  */
6149 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6150 {
6151         /* If device needs checksumming, downgrade to it. */
6152         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6153                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6154         else if (mask & NETIF_F_ALL_CSUM) {
6155                 /* If one device supports v4/v6 checksumming, set for all. */
6156                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6157                     !(all & NETIF_F_GEN_CSUM)) {
6158                         all &= ~NETIF_F_ALL_CSUM;
6159                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6160                 }
6161
6162                 /* If one device supports hw checksumming, set for all. */
6163                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6164                         all &= ~NETIF_F_ALL_CSUM;
6165                         all |= NETIF_F_HW_CSUM;
6166                 }
6167         }
6168
6169         one |= NETIF_F_ALL_CSUM;
6170
6171         one |= all & NETIF_F_ONE_FOR_ALL;
6172         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6173         all |= one & mask & NETIF_F_ONE_FOR_ALL;
6174
6175         return all;
6176 }
6177 EXPORT_SYMBOL(netdev_increment_features);
6178
6179 static struct hlist_head *netdev_create_hash(void)
6180 {
6181         int i;
6182         struct hlist_head *hash;
6183
6184         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6185         if (hash != NULL)
6186                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6187                         INIT_HLIST_HEAD(&hash[i]);
6188
6189         return hash;
6190 }
6191
6192 /* Initialize per network namespace state */
6193 static int __net_init netdev_init(struct net *net)
6194 {
6195         INIT_LIST_HEAD(&net->dev_base_head);
6196
6197         net->dev_name_head = netdev_create_hash();
6198         if (net->dev_name_head == NULL)
6199                 goto err_name;
6200
6201         net->dev_index_head = netdev_create_hash();
6202         if (net->dev_index_head == NULL)
6203                 goto err_idx;
6204
6205         return 0;
6206
6207 err_idx:
6208         kfree(net->dev_name_head);
6209 err_name:
6210         return -ENOMEM;
6211 }
6212
6213 /**
6214  *      netdev_drivername - network driver for the device
6215  *      @dev: network device
6216  *      @buffer: buffer for resulting name
6217  *      @len: size of buffer
6218  *
6219  *      Determine network driver for device.
6220  */
6221 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6222 {
6223         const struct device_driver *driver;
6224         const struct device *parent;
6225
6226         if (len <= 0 || !buffer)
6227                 return buffer;
6228         buffer[0] = 0;
6229
6230         parent = dev->dev.parent;
6231
6232         if (!parent)
6233                 return buffer;
6234
6235         driver = parent->driver;
6236         if (driver && driver->name)
6237                 strlcpy(buffer, driver->name, len);
6238         return buffer;
6239 }
6240
6241 static int __netdev_printk(const char *level, const struct net_device *dev,
6242                            struct va_format *vaf)
6243 {
6244         int r;
6245
6246         if (dev && dev->dev.parent)
6247                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6248                                netdev_name(dev), vaf);
6249         else if (dev)
6250                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6251         else
6252                 r = printk("%s(NULL net_device): %pV", level, vaf);
6253
6254         return r;
6255 }
6256
6257 int netdev_printk(const char *level, const struct net_device *dev,
6258                   const char *format, ...)
6259 {
6260         struct va_format vaf;
6261         va_list args;
6262         int r;
6263
6264         va_start(args, format);
6265
6266         vaf.fmt = format;
6267         vaf.va = &args;
6268
6269         r = __netdev_printk(level, dev, &vaf);
6270         va_end(args);
6271
6272         return r;
6273 }
6274 EXPORT_SYMBOL(netdev_printk);
6275
6276 #define define_netdev_printk_level(func, level)                 \
6277 int func(const struct net_device *dev, const char *fmt, ...)    \
6278 {                                                               \
6279         int r;                                                  \
6280         struct va_format vaf;                                   \
6281         va_list args;                                           \
6282                                                                 \
6283         va_start(args, fmt);                                    \
6284                                                                 \
6285         vaf.fmt = fmt;                                          \
6286         vaf.va = &args;                                         \
6287                                                                 \
6288         r = __netdev_printk(level, dev, &vaf);                  \
6289         va_end(args);                                           \
6290                                                                 \
6291         return r;                                               \
6292 }                                                               \
6293 EXPORT_SYMBOL(func);
6294
6295 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6296 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6297 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6298 define_netdev_printk_level(netdev_err, KERN_ERR);
6299 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6300 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6301 define_netdev_printk_level(netdev_info, KERN_INFO);
6302
6303 static void __net_exit netdev_exit(struct net *net)
6304 {
6305         kfree(net->dev_name_head);
6306         kfree(net->dev_index_head);
6307 }
6308
6309 static struct pernet_operations __net_initdata netdev_net_ops = {
6310         .init = netdev_init,
6311         .exit = netdev_exit,
6312 };
6313
6314 static void __net_exit default_device_exit(struct net *net)
6315 {
6316         struct net_device *dev, *aux;
6317         /*
6318          * Push all migratable network devices back to the
6319          * initial network namespace
6320          */
6321         rtnl_lock();
6322         for_each_netdev_safe(net, dev, aux) {
6323                 int err;
6324                 char fb_name[IFNAMSIZ];
6325
6326                 /* Ignore unmoveable devices (i.e. loopback) */
6327                 if (dev->features & NETIF_F_NETNS_LOCAL)
6328                         continue;
6329
6330                 /* Leave virtual devices for the generic cleanup */
6331                 if (dev->rtnl_link_ops)
6332                         continue;
6333
6334                 /* Push remaing network devices to init_net */
6335                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6336                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6337                 if (err) {
6338                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6339                                 __func__, dev->name, err);
6340                         BUG();
6341                 }
6342         }
6343         rtnl_unlock();
6344 }
6345
6346 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6347 {
6348         /* At exit all network devices most be removed from a network
6349          * namespace.  Do this in the reverse order of registration.
6350          * Do this across as many network namespaces as possible to
6351          * improve batching efficiency.
6352          */
6353         struct net_device *dev;
6354         struct net *net;
6355         LIST_HEAD(dev_kill_list);
6356
6357         rtnl_lock();
6358         list_for_each_entry(net, net_list, exit_list) {
6359                 for_each_netdev_reverse(net, dev) {
6360                         if (dev->rtnl_link_ops)
6361                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6362                         else
6363                                 unregister_netdevice_queue(dev, &dev_kill_list);
6364                 }
6365         }
6366         unregister_netdevice_many(&dev_kill_list);
6367         rtnl_unlock();
6368 }
6369
6370 static struct pernet_operations __net_initdata default_device_ops = {
6371         .exit = default_device_exit,
6372         .exit_batch = default_device_exit_batch,
6373 };
6374
6375 /*
6376  *      Initialize the DEV module. At boot time this walks the device list and
6377  *      unhooks any devices that fail to initialise (normally hardware not
6378  *      present) and leaves us with a valid list of present and active devices.
6379  *
6380  */
6381
6382 /*
6383  *       This is called single threaded during boot, so no need
6384  *       to take the rtnl semaphore.
6385  */
6386 static int __init net_dev_init(void)
6387 {
6388         int i, rc = -ENOMEM;
6389
6390         BUG_ON(!dev_boot_phase);
6391
6392         if (dev_proc_init())
6393                 goto out;
6394
6395         if (netdev_kobject_init())
6396                 goto out;
6397
6398         INIT_LIST_HEAD(&ptype_all);
6399         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6400                 INIT_LIST_HEAD(&ptype_base[i]);
6401
6402         if (register_pernet_subsys(&netdev_net_ops))
6403                 goto out;
6404
6405         /*
6406          *      Initialise the packet receive queues.
6407          */
6408
6409         for_each_possible_cpu(i) {
6410                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6411
6412                 memset(sd, 0, sizeof(*sd));
6413                 skb_queue_head_init(&sd->input_pkt_queue);
6414                 skb_queue_head_init(&sd->process_queue);
6415                 sd->completion_queue = NULL;
6416                 INIT_LIST_HEAD(&sd->poll_list);
6417                 sd->output_queue = NULL;
6418                 sd->output_queue_tailp = &sd->output_queue;
6419 #ifdef CONFIG_RPS
6420                 sd->csd.func = rps_trigger_softirq;
6421                 sd->csd.info = sd;
6422                 sd->csd.flags = 0;
6423                 sd->cpu = i;
6424 #endif
6425
6426                 sd->backlog.poll = process_backlog;
6427                 sd->backlog.weight = weight_p;
6428                 sd->backlog.gro_list = NULL;
6429                 sd->backlog.gro_count = 0;
6430         }
6431
6432         dev_boot_phase = 0;
6433
6434         /* The loopback device is special if any other network devices
6435          * is present in a network namespace the loopback device must
6436          * be present. Since we now dynamically allocate and free the
6437          * loopback device ensure this invariant is maintained by
6438          * keeping the loopback device as the first device on the
6439          * list of network devices.  Ensuring the loopback devices
6440          * is the first device that appears and the last network device
6441          * that disappears.
6442          */
6443         if (register_pernet_device(&loopback_net_ops))
6444                 goto out;
6445
6446         if (register_pernet_device(&default_device_ops))
6447                 goto out;
6448
6449         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6450         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6451
6452         hotcpu_notifier(dev_cpu_callback, 0);
6453         dst_init();
6454         dev_mcast_init();
6455         rc = 0;
6456 out:
6457         return rc;
6458 }
6459
6460 subsys_initcall(net_dev_init);
6461
6462 static int __init initialize_hashrnd(void)
6463 {
6464         get_random_bytes(&hashrnd, sizeof(hashrnd));
6465         return 0;
6466 }
6467
6468 late_initcall_sync(initialize_hashrnd);
6469