net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/slab.h>
  84 #include <linux/sched.h>
  85 #include <linux/mutex.h>
  86 #include <linux/string.h>
  87 #include <linux/mm.h>
  88 #include <linux/socket.h>
  89 #include <linux/sockios.h>
  90 #include <linux/errno.h>
  91 #include <linux/interrupt.h>
  92 #include <linux/if_ether.h>
  93 #include <linux/netdevice.h>
  94 #include <linux/etherdevice.h>
  95 #include <linux/ethtool.h>
  96 #include <linux/notifier.h>
  97 #include <linux/skbuff.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <linux/rtnetlink.h>
 101 #include <linux/proc_fs.h>
 102 #include <linux/seq_file.h>
 103 #include <linux/stat.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/kmod.h>
 111 #include <linux/module.h>
 112 #include <linux/netpoll.h>
 113 #include <linux/rcupdate.h>
 114 #include <linux/delay.h>
 115 #include <net/wext.h>
 116 #include <net/iw_handler.h>
 117 #include <asm/current.h>
 118 #include <linux/audit.h>
 119 #include <linux/dmaengine.h>
 120 #include <linux/err.h>
 121 #include <linux/ctype.h>
 122 #include <linux/if_arp.h>
 123 #include <linux/if_vlan.h>
 124 #include <linux/ip.h>
 125 #include <net/ip.h>
 126 #include <linux/ipv6.h>
 127 #include <linux/in.h>
 128 #include <linux/jhash.h>
 129 #include <linux/random.h>
 130 #include <trace/events/napi.h>
 131 #include <trace/events/net.h>
 132 #include <trace/events/skb.h>
 133 #include <linux/pci.h>
 134 #include <linux/inetdevice.h>
 135 #include <linux/cpu_rmap.h>
 136
 137 #include "net-sysfs.h"
 138
 139 /* Instead of increasing this, you should create a hash table.
      * NOTE(review): no user of MAX_GRO_SKBS appears in this part of the
      * file; presumably it caps how many packets are held for GRO merging
      * at once - confirm at the use site. */
 140 #define MAX_GRO_SKBS 8
 141
 142 /* This should be increased if a protocol with a bigger head is added.
      * Evaluates to MAX_HEADER plus 128 bytes.
      * NOTE(review): presumably the header room GRO reserves when it builds
      * a packet - confirm at the use site. */
 143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
 145 /*
 146  *      The list of packet types we will receive (as opposed to discard)
 147  *      and the routines to invoke.
 148  *
 149  *      Why 16. Because with 16 the only overlap we get on a hash of the
 150  *      low nibble of the protocol value is RARP/SNAP/X.25.
 151  *
 152  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 153  *             sure which should go first, but I bet it won't make much
 154  *             difference if we are running VLANs.  The good news is that
 155  *             this protocol won't be in the list unless compiled in, so
 156  *             the average user (w/out VLANs) will not be adversely affected.
 157  *             --BLG
 158  *
 159  *              0800    IP
 160  *              8100    802.1Q VLAN
 161  *              0001    802.3
 162  *              0002    AX.25
 163  *              0004    802.2
 164  *              8035    RARP
 165  *              0005    SNAP
 166  *              0805    X.25
 167  *              0806    ARP
 168  *              8137    IPX
 169  *              0009    Localtalk
 170  *              86DD    IPv6
 171  */
 172
 173 #define PTYPE_HASH_SIZE (16)	/* number of buckets in ptype_base[] */
 174 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)	/* ptype_head() ANDs the host-order type with this */
 175
 176 static DEFINE_SPINLOCK(ptype_lock);	/* held by dev_add_pack()/__dev_remove_pack() while they edit the lists */
 177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;	/* handlers bucketed by protocol type */
 178 static struct list_head ptype_all __read_mostly;        /* Taps: handlers registered for ETH_P_ALL */
 179
 180 /*
 181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 182  * semaphore.
 183  *
 184  * Pure readers hold dev_base_lock for reading, or rcu_read_lock().
 185  *
 186  * Writers must hold the rtnl semaphore while they loop through the
 187  * dev_base_head list, and hold dev_base_lock for writing when they do the
 188  * actual updates.  This allows pure readers to access the list even
 189  * while a writer is preparing to update it.
 190  *
 191  * To put it another way, dev_base_lock is held for writing only to
 192  * protect against pure readers; the rtnl semaphore provides the
 193  * protection against other writers.
 194  *
 195  * For example usages see list_netdevice()/unlist_netdevice() in this
 196  * file, and register_netdevice()/unregister_netdevice(), which must be
 197  * called with the rtnl semaphore held.
 198  */
 199 DEFINE_RWLOCK(dev_base_lock);
 200 EXPORT_SYMBOL(dev_base_lock);
 201
 202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 203 {
 204         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 205         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 206 }
 207
 208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 209 {
 210         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 211 }
 212
 213 static inline void rps_lock(struct softnet_data *sd)
 214 {
 215 #ifdef CONFIG_RPS
 216         spin_lock(&sd->input_pkt_queue.lock);
 217 #endif
 218 }
 219
 220 static inline void rps_unlock(struct softnet_data *sd)
 221 {
 222 #ifdef CONFIG_RPS
 223         spin_unlock(&sd->input_pkt_queue.lock);
 224 #endif
 225 }
 226
 227 /* Device list insertion */
 228 static int list_netdevice(struct net_device *dev)
 229 {
 230         struct net *net = dev_net(dev);
 231
 232         ASSERT_RTNL();
 233
 234         write_lock_bh(&dev_base_lock);
 235         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 236         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 237         hlist_add_head_rcu(&dev->index_hlist,
 238                            dev_index_hash(net, dev->ifindex));
 239         write_unlock_bh(&dev_base_lock);
 240         return 0;
 241 }
 242
 243 /* Device list removal
 244  * caller must respect a RCU grace period before freeing/reusing dev
 245  */
 246 static void unlist_netdevice(struct net_device *dev)
 247 {
 248         ASSERT_RTNL();
 249
 250         /* Unlink dev from the device chain */
 251         write_lock_bh(&dev_base_lock);
 252         list_del_rcu(&dev->dev_list);
 253         hlist_del_rcu(&dev->name_hlist);
 254         hlist_del_rcu(&dev->index_hlist);
 255         write_unlock_bh(&dev_base_lock);
 256 }
 257
 258 /*
 259  *      Our notifier list.
      *
      *      NOTE(review): declared as a raw notifier head; nothing in this
      *      part of the file registers on it or invokes it.  Presumably the
      *      users serialise access themselves (e.g. under RTNL) - confirm.
 260  */
 261
 262 static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264 /*
 265  *      Device drivers call our routines to queue packets here. We empty the
 266  *      queue in the local softnet handler.
      *
      *      One instance is defined per CPU.  rps_lock()/rps_unlock() in this
      *      file take the lock of an instance's input_pkt_queue when the
      *      kernel is built with CONFIG_RPS.
 267  */
 268
 269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 270 EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 272 #ifdef CONFIG_LOCKDEP
 273 /*
 274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 275  * according to dev->type
      *
      * netdev_lock_type[] and netdev_lock_name[] are parallel tables: entry i
      * of one belongs to entry i of the other, so they must be edited
      * together and kept in the same order.  netdev_lock_pos() maps a device
      * type to that shared index and falls back to the last entry
      * (ARPHRD_NONE) for a type that is not listed.
 276  */
 277 static const unsigned short netdev_lock_type[] =
 278         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 279          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 280          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 281          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 282          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 283          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 284          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 285          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 286          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 287          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 288          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 289          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 290          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 291          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 292          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 293          ARPHRD_VOID, ARPHRD_NONE};
 294
 295 static const char *const netdev_lock_name[] =
 296         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 297          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 298          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 299          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 300          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 301          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 302          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 303          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 304          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 305          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 306          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 307          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 308          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 309          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 310          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 311          "_xmit_VOID", "_xmit_NONE"};
 312
      /*
       * One lockdep class key per table entry.  The first array is used for
       * transmit locks, the second for dev->addr_list_lock; both are indexed
       * by netdev_lock_pos() and both are registered under the "_xmit_*"
       * names from netdev_lock_name[].
       */
 313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 315
 316 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 317 {
 318         int i;
 319
 320         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 321                 if (netdev_lock_type[i] == dev_type)
 322                         return i;
 323         /* the last key is used by default */
 324         return ARRAY_SIZE(netdev_lock_type) - 1;
 325 }
 326
 327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 328                                                  unsigned short dev_type)
 329 {
 330         int i;
 331
 332         i = netdev_lock_pos(dev_type);
 333         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 334                                    netdev_lock_name[i]);
 335 }
 336
 337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 338 {
 339         int i;
 340
 341         i = netdev_lock_pos(dev->type);
 342         lockdep_set_class_and_name(&dev->addr_list_lock,
 343                                    &netdev_addr_lock_key[i],
 344                                    netdev_lock_name[i]);
 345 }
 346 #else
 347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 348                                                  unsigned short dev_type)
 349 {
             /* Built without CONFIG_LOCKDEP: no lock class to assign, so this is a no-op. */
 350 }
 351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 352 {
             /* Built without CONFIG_LOCKDEP: no lock class to assign, so this is a no-op. */
 353 }
 354 #endif
 355
 356 /*******************************************************************************
 357
 358                 Protocol management and registration routines
 359
 360 *******************************************************************************/
 361
 362 /*
 363  *      Add a protocol ID to the list. Now that the input handler is
 364  *      smarter we can dispense with all the messy stuff that used to be
 365  *      here.
 366  *
 367  *      BEWARE!!! Protocol handlers, mangling input packets,
 368  *      MUST BE last in hash buckets and checking protocol handlers
 369  *      MUST start from promiscuous ptype_all chain in net_bh.
 370  *      It is true now, do not change it.
 371  *      Explanation follows: if protocol handler, mangling packet, will
 372  *      be the first on list, it is not able to sense, that packet
 373  *      is cloned and should be copied-on-write, so that it will
 374  *      change it and subsequent readers will get broken packet.
 375  *                                                      --ANK (980803)
 376  */
 377
 378 static inline struct list_head *ptype_head(const struct packet_type *pt)
 379 {
 380         if (pt->type == htons(ETH_P_ALL))
 381                 return &ptype_all;
 382         else
 383                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 384 }
 385
 386 /**
 387  *      dev_add_pack - add packet handler
 388  *      @pt: packet type declaration
 389  *
 390  *      Add a protocol handler to the networking stack. The passed &packet_type
 391  *      is linked into kernel lists and may not be freed until it has been
 392  *      removed from the kernel lists.
 393  *
 394  *      This call does not sleep therefore it can not
 395  *      guarantee all CPU's that are in middle of receiving packets
 396  *      will see the new packet type (until the next received packet).
 397  */
 398
 399 void dev_add_pack(struct packet_type *pt)
 400 {
 401         struct list_head *head = ptype_head(pt);
 402
 403         spin_lock(&ptype_lock);
 404         list_add_rcu(&pt->list, head);
 405         spin_unlock(&ptype_lock);
 406 }
 407 EXPORT_SYMBOL(dev_add_pack);
 408
 409 /**
 410  *      __dev_remove_pack        - remove packet handler
 411  *      @pt: packet type declaration
 412  *
 413  *      Remove a protocol handler that was previously added to the kernel
 414  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 415  *      from the kernel lists and can be freed or reused once this function
 416  *      returns.
 417  *
 418  *      The packet type might still be in use by receivers
 419  *      and must not be freed until after all the CPU's have gone
 420  *      through a quiescent state.
 421  */
 422 void __dev_remove_pack(struct packet_type *pt)
 423 {
 424         struct list_head *head = ptype_head(pt);
 425         struct packet_type *pt1;
 426
 427         spin_lock(&ptype_lock);
 428
 429         list_for_each_entry(pt1, head, list) {
 430                 if (pt == pt1) {
 431                         list_del_rcu(&pt->list);
 432                         goto out;
 433                 }
 434         }
 435
 436         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 437 out:
 438         spin_unlock(&ptype_lock);
 439 }
 440 EXPORT_SYMBOL(__dev_remove_pack);
 441
 442 /**
 443  *      dev_remove_pack  - remove packet handler
 444  *      @pt: packet type declaration
 445  *
 446  *      Remove a protocol handler that was previously added to the kernel
 447  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 448  *      from the kernel lists and can be freed or reused once this function
 449  *      returns.
 450  *
 451  *      This call sleeps to guarantee that no CPU is looking at the packet
 452  *      type after return.
 453  */
 454 void dev_remove_pack(struct packet_type *pt)
 455 {
 456         __dev_remove_pack(pt);
 457
 458         synchronize_net();
 459 }
 460 EXPORT_SYMBOL(dev_remove_pack);
 461
 462 /******************************************************************************
 463
 464                       Device Boot-time Settings Routines
 465
 466 *******************************************************************************/
 467
 468 /* Boot time configuration table.
      * Filled by netdev_boot_setup_add().  A slot whose name starts with
      * '\0' or ' ' counts as free there and is skipped by
      * netdev_boot_setup_check(). */
 469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 470
 471 /**
 472  *      netdev_boot_setup_add   - add new setup entry
 473  *      @name: name of the device
 474  *      @map: configured settings for the device
 475  *
 476  *      Adds new setup entry to the dev_boot_setup list.  The function
 477  *      returns 0 on error and 1 on success.  This is a generic routine to
 478  *      all netdevices.
 479  */
 480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 481 {
 482         struct netdev_boot_setup *s;
 483         int i;
 484
 485         s = dev_boot_setup;
 486         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 487                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 488                         memset(s[i].name, 0, sizeof(s[i].name));
 489                         strlcpy(s[i].name, name, IFNAMSIZ);
 490                         memcpy(&s[i].map, map, sizeof(s[i].map));
 491                         break;
 492                 }
 493         }
 494
 495         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 496 }
 497
 498 /**
 499  *      netdev_boot_setup_check - check boot time settings
 500  *      @dev: the netdevice
 501  *
 502  *      Check boot time settings for the device.
 503  *      The found settings are set for the device to be used
 504  *      later in the device probing.
 505  *      Returns 0 if no settings found, 1 if they are.
 506  */
 507 int netdev_boot_setup_check(struct net_device *dev)
 508 {
 509         struct netdev_boot_setup *s = dev_boot_setup;
 510         int i;
 511
 512         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 513                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 514                     !strcmp(dev->name, s[i].name)) {
 515                         dev->irq        = s[i].map.irq;
 516                         dev->base_addr  = s[i].map.base_addr;
 517                         dev->mem_start  = s[i].map.mem_start;
 518                         dev->mem_end    = s[i].map.mem_end;
 519                         return 1;
 520                 }
 521         }
 522         return 0;
 523 }
 524 EXPORT_SYMBOL(netdev_boot_setup_check);
 525
 526
 527 /**
 528  *      netdev_boot_base        - get address from boot time settings
 529  *      @prefix: prefix for network device
 530  *      @unit: id for network device
 531  *
 532  *      Check boot time settings for the base address of device.
 533  *      The found settings are set for the device to be used
 534  *      later in the device probing.
 535  *      Returns 0 if no settings found.
 536  */
 537 unsigned long netdev_boot_base(const char *prefix, int unit)
 538 {
 539         const struct netdev_boot_setup *s = dev_boot_setup;
 540         char name[IFNAMSIZ];
 541         int i;
 542
 543         sprintf(name, "%s%d", prefix, unit);
 544
 545         /*
 546          * If device already registered then return base of 1
 547          * to indicate not to probe for this interface
 548          */
 549         if (__dev_get_by_name(&init_net, name))
 550                 return 1;
 551
 552         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 553                 if (!strcmp(name, s[i].name))
 554                         return s[i].map.base_addr;
 555         return 0;
 556 }
 557
 558 /*
 559  * Saves at boot time configured settings for any netdevice.
 560  */
 561 int __init netdev_boot_setup(char *str)
 562 {
 563         int ints[5];
 564         struct ifmap map;
 565
 566         str = get_options(str, ARRAY_SIZE(ints), ints);
 567         if (!str || !*str)
 568                 return 0;
 569
 570         /* Save settings */
 571         memset(&map, 0, sizeof(map));
 572         if (ints[0] > 0)
 573                 map.irq = ints[1];
 574         if (ints[0] > 1)
 575                 map.base_addr = ints[2];
 576         if (ints[0] > 2)
 577                 map.mem_start = ints[3];
 578         if (ints[0] > 3)
 579                 map.mem_end = ints[4];
 580
 581         /* Add new entry to the list */
 582         return netdev_boot_setup_add(str, &map);
 583 }
 584
 585 __setup("netdev=", netdev_boot_setup);
 586
 587 /*******************************************************************************
 588
 589                             Device Interface Subroutines
 590
 591 *******************************************************************************/
 592
 593 /**
 594  *      __dev_get_by_name       - find a device by its name
 595  *      @net: the applicable net namespace
 596  *      @name: name to find
 597  *
 598  *      Find an interface by name. Must be called under RTNL semaphore
 599  *      or @dev_base_lock. If the name is found a pointer to the device
 600  *      is returned. If the name is not found then %NULL is returned. The
 601  *      reference counters are not incremented so the caller must be
 602  *      careful with locks.
 603  */
 604
 605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 606 {
 607         struct hlist_node *p;
 608         struct net_device *dev;
 609         struct hlist_head *head = dev_name_hash(net, name);
 610
 611         hlist_for_each_entry(dev, p, head, name_hlist)
 612                 if (!strncmp(dev->name, name, IFNAMSIZ))
 613                         return dev;
 614
 615         return NULL;
 616 }
 617 EXPORT_SYMBOL(__dev_get_by_name);
 618
 619 /**
 620  *      dev_get_by_name_rcu     - find a device by its name
 621  *      @net: the applicable net namespace
 622  *      @name: name to find
 623  *
 624  *      Find an interface by name.
 625  *      If the name is found a pointer to the device is returned.
 626  *      If the name is not found then %NULL is returned.
 627  *      The reference counters are not incremented so the caller must be
 628  *      careful with locks. The caller must hold RCU lock.
 629  */
 630
 631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 632 {
 633         struct hlist_node *p;
 634         struct net_device *dev;
 635         struct hlist_head *head = dev_name_hash(net, name);
 636
 637         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 638                 if (!strncmp(dev->name, name, IFNAMSIZ))
 639                         return dev;
 640
 641         return NULL;
 642 }
 643 EXPORT_SYMBOL(dev_get_by_name_rcu);
 644
 645 /**
 646  *      dev_get_by_name         - find a device by its name
 647  *      @net: the applicable net namespace
 648  *      @name: name to find
 649  *
 650  *      Find an interface by name. This can be called from any
 651  *      context and does its own locking. The returned handle has
 652  *      the usage count incremented and the caller must use dev_put() to
 653  *      release it when it is no longer needed. %NULL is returned if no
 654  *      matching device is found.
 655  */
 656
 657 struct net_device *dev_get_by_name(struct net *net, const char *name)
 658 {
 659         struct net_device *dev;
 660
 661         rcu_read_lock();
 662         dev = dev_get_by_name_rcu(net, name);
 663         if (dev)
 664                 dev_hold(dev);
 665         rcu_read_unlock();
 666         return dev;
 667 }
 668 EXPORT_SYMBOL(dev_get_by_name);
 669
 670 /**
 671  *      __dev_get_by_index - find a device by its ifindex
 672  *      @net: the applicable net namespace
 673  *      @ifindex: index of device
 674  *
 675  *      Search for an interface by index. Returns %NULL if the device
 676  *      is not found or a pointer to the device. The device has not
 677  *      had its reference counter increased so the caller must be careful
 678  *      about locking. The caller must hold either the RTNL semaphore
 679  *      or @dev_base_lock.
 680  */
 681
 682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 683 {
 684         struct hlist_node *p;
 685         struct net_device *dev;
 686         struct hlist_head *head = dev_index_hash(net, ifindex);
 687
 688         hlist_for_each_entry(dev, p, head, index_hlist)
 689                 if (dev->ifindex == ifindex)
 690                         return dev;
 691
 692         return NULL;
 693 }
 694 EXPORT_SYMBOL(__dev_get_by_index);
 695
 696 /**
 697  *      dev_get_by_index_rcu - find a device by its ifindex
 698  *      @net: the applicable net namespace
 699  *      @ifindex: index of device
 700  *
 701  *      Search for an interface by index. Returns %NULL if the device
 702  *      is not found or a pointer to the device. The device has not
 703  *      had its reference counter increased so the caller must be careful
 704  *      about locking. The caller must hold RCU lock.
 705  */
 706
 707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 708 {
 709         struct hlist_node *p;
 710         struct net_device *dev;
 711         struct hlist_head *head = dev_index_hash(net, ifindex);
 712
 713         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 714                 if (dev->ifindex == ifindex)
 715                         return dev;
 716
 717         return NULL;
 718 }
 719 EXPORT_SYMBOL(dev_get_by_index_rcu);
 720
 721
 722 /**
 723  *      dev_get_by_index - find a device by its ifindex
 724  *      @net: the applicable net namespace
 725  *      @ifindex: index of device
 726  *
 727  *      Search for an interface by index. Returns NULL if the device
 728  *      is not found or a pointer to the device. The device returned has
 729  *      had a reference added and the pointer is safe until the user calls
 730  *      dev_put to indicate they have finished with it.
 731  */
 732
 733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 734 {
 735         struct net_device *dev;
 736
 737         rcu_read_lock();
 738         dev = dev_get_by_index_rcu(net, ifindex);
 739         if (dev)
 740                 dev_hold(dev);
 741         rcu_read_unlock();
 742         return dev;
 743 }
 744 EXPORT_SYMBOL(dev_get_by_index);
 745
 746 /**
 747  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 748  *      @net: the applicable net namespace
 749  *      @type: media type of device
 750  *      @ha: hardware address
 751  *
 752  *      Search for an interface by MAC address. Returns NULL if the device
 753  *      is not found or a pointer to the device.
 754  *      The caller must hold RCU or RTNL.
 755  *      The returned device has not had its ref count increased
 756  *      and the caller must therefore be careful about locking
 757  *
 758  */
 759
 760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 761                                        const char *ha)
 762 {
 763         struct net_device *dev;
 764
 765         for_each_netdev_rcu(net, dev)
 766                 if (dev->type == type &&
 767                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 768                         return dev;
 769
 770         return NULL;
 771 }
 772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 773
 774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 775 {
 776         struct net_device *dev;
 777
 778         ASSERT_RTNL();
 779         for_each_netdev(net, dev)
 780                 if (dev->type == type)
 781                         return dev;
 782
 783         return NULL;
 784 }
 785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 786
 787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 788 {
 789         struct net_device *dev, *ret = NULL;
 790
 791         rcu_read_lock();
 792         for_each_netdev_rcu(net, dev)
 793                 if (dev->type == type) {
 794                         dev_hold(dev);
 795                         ret = dev;
 796                         break;
 797                 }
 798         rcu_read_unlock();
 799         return ret;
 800 }
 801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 802
 803 /**
 804  *      dev_get_by_flags_rcu - find any device with given flags
 805  *      @net: the applicable net namespace
 806  *      @if_flags: IFF_* values
 807  *      @mask: bitmask of bits in if_flags to check
 808  *
 809  *      Search for any interface with the given flags. Returns NULL if a device
 810  *      is not found or a pointer to the device. Must be called inside
 811  *      rcu_read_lock(), and result refcount is unchanged.
 812  */
 813
 814 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 815                                     unsigned short mask)
 816 {
 817         struct net_device *dev, *ret;
 818
 819         ret = NULL;
 820         for_each_netdev_rcu(net, dev) {
 821                 if (((dev->flags ^ if_flags) & mask) == 0) {
 822                         ret = dev;
 823                         break;
 824                 }
 825         }
 826         return ret;
 827 }
 828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 829
 830 /**
 831  *      dev_valid_name - check if name is okay for network device
 832  *      @name: name string
 833  *
 834  *      Network device names need to be valid file names to
 835  *      to allow sysfs to work.  We also disallow any kind of
 836  *      whitespace.
 837  */
 838 int dev_valid_name(const char *name)
 839 {
 840         if (*name == '\0')
 841                 return 0;
 842         if (strlen(name) >= IFNAMSIZ)
 843                 return 0;
 844         if (!strcmp(name, ".") || !strcmp(name, ".."))
 845                 return 0;
 846
 847         while (*name) {
 848                 if (*name == '/' || isspace(*name))
 849                         return 0;
 850                 name++;
 851         }
 852         return 1;
 853 }
 854 EXPORT_SYMBOL(dev_valid_name);
 855
 856 /**
 857  *      __dev_alloc_name - allocate a name for a device
 858  *      @net: network namespace to allocate the device name in
 859  *      @name: name format string
 860  *      @buf:  scratch buffer and result name string
 861  *
 862  *      Passed a format string - eg "lt%d" it will try and find a suitable
 863  *      id. It scans list of devices to build up a free map, then chooses
 864  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 865  *      while allocating the name and adding the device in order to avoid
 866  *      duplicates.
 867  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 868  *      Returns the number of the unit assigned or a negative errno code.
 869  */
 870
 871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 872 {
 873         int i = 0;
 874         const char *p;
 875         const int max_netdevices = 8*PAGE_SIZE;
 876         unsigned long *inuse;
 877         struct net_device *d;
 878
 879         p = strnchr(name, IFNAMSIZ-1, '%');
 880         if (p) {
 881                 /*
 882                  * Verify the string as this thing may have come from
 883                  * the user.  There must be either one "%d" and no other "%"
 884                  * characters.
 885                  */
 886                 if (p[1] != 'd' || strchr(p + 2, '%'))
 887                         return -EINVAL;
 888
 889                 /* Use one page as a bit array of possible slots */
 890                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 891                 if (!inuse)
 892                         return -ENOMEM;
 893
 894                 for_each_netdev(net, d) {
 895                         if (!sscanf(d->name, name, &i))
 896                                 continue;
 897                         if (i < 0 || i >= max_netdevices)
 898                                 continue;
 899
 900                         /*  avoid cases where sscanf is not exact inverse of printf */
 901                         snprintf(buf, IFNAMSIZ, name, i);
 902                         if (!strncmp(buf, d->name, IFNAMSIZ))
 903                                 set_bit(i, inuse);
 904                 }
 905
 906                 i = find_first_zero_bit(inuse, max_netdevices);
 907                 free_page((unsigned long) inuse);
 908         }
 909
 910         if (buf != name)
 911                 snprintf(buf, IFNAMSIZ, name, i);
 912         if (!__dev_get_by_name(net, buf))
 913                 return i;
 914
 915         /* It is possible to run out of possible slots
 916          * when the name is long and there isn't enough space left
 917          * for the digits, or if all bits are used.
 918          */
 919         return -ENFILE;
 920 }
 921
 922 /**
 923  *      dev_alloc_name - allocate a name for a device
 924  *      @dev: device
 925  *      @name: name format string
 926  *
 927  *      Passed a format string - eg "lt%d" it will try and find a suitable
 928  *      id. It scans list of devices to build up a free map, then chooses
 929  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 930  *      while allocating the name and adding the device in order to avoid
 931  *      duplicates.
 932  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 933  *      Returns the number of the unit assigned or a negative errno code.
 934  */
 935
 936 int dev_alloc_name(struct net_device *dev, const char *name)
 937 {
 938         char buf[IFNAMSIZ];
 939         struct net *net;
 940         int ret;
 941
 942         BUG_ON(!dev_net(dev));
 943         net = dev_net(dev);
 944         ret = __dev_alloc_name(net, name, buf);
 945         if (ret >= 0)
 946                 strlcpy(dev->name, buf, IFNAMSIZ);
 947         return ret;
 948 }
 949 EXPORT_SYMBOL(dev_alloc_name);
 950
 951 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
 952 {
 953         struct net *net;
 954
 955         BUG_ON(!dev_net(dev));
 956         net = dev_net(dev);
 957
 958         if (!dev_valid_name(name))
 959                 return -EINVAL;
 960
 961         if (fmt && strchr(name, '%'))
 962                 return dev_alloc_name(dev, name);
 963         else if (__dev_get_by_name(net, name))
 964                 return -EEXIST;
 965         else if (dev->name != name)
 966                 strlcpy(dev->name, name, IFNAMSIZ);
 967
 968         return 0;
 969 }
 970
 971 /**
 972  *      dev_change_name - change name of a device
 973  *      @dev: device
 974  *      @newname: name (or format string) must be at least IFNAMSIZ
 975  *
 976  *      Change name of a device, can pass format strings "eth%d".
 977  *      for wildcarding.
 978  */
 979 int dev_change_name(struct net_device *dev, const char *newname)
 980 {
 981         char oldname[IFNAMSIZ];
 982         int err = 0;
 983         int ret;
 984         struct net *net;
 985
 986         ASSERT_RTNL();
 987         BUG_ON(!dev_net(dev));
 988
 989         net = dev_net(dev);
 990         if (dev->flags & IFF_UP)
 991                 return -EBUSY;
 992
 993         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 994                 return 0;
 995
 996         memcpy(oldname, dev->name, IFNAMSIZ);
 997
 998         err = dev_get_valid_name(dev, newname, 1);
 999         if (err < 0)
1000                 return err;
1001
1002 rollback:
1003         ret = device_rename(&dev->dev, dev->name);
1004         if (ret) {
1005                 memcpy(dev->name, oldname, IFNAMSIZ);
1006                 return ret;
1007         }
1008
1009         write_lock_bh(&dev_base_lock);
1010         hlist_del(&dev->name_hlist);
1011         write_unlock_bh(&dev_base_lock);
1012
1013         synchronize_rcu();
1014
1015         write_lock_bh(&dev_base_lock);
1016         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017         write_unlock_bh(&dev_base_lock);
1018
1019         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020         ret = notifier_to_errno(ret);
1021
1022         if (ret) {
1023                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1024                 if (err >= 0) {
1025                         err = ret;
1026                         memcpy(dev->name, oldname, IFNAMSIZ);
1027                         goto rollback;
1028                 } else {
1029                         printk(KERN_ERR
1030                                "%s: name change rollback failed: %d.\n",
1031                                dev->name, ret);
1032                 }
1033         }
1034
1035         return err;
1036 }
1037
1038 /**
1039  *      dev_set_alias - change ifalias of a device
1040  *      @dev: device
1041  *      @alias: name up to IFALIASZ
1042  *      @len: limit of bytes to copy from info
1043  *
1044  *      Set ifalias for a device,
1045  */
1046 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047 {
1048         ASSERT_RTNL();
1049
1050         if (len >= IFALIASZ)
1051                 return -EINVAL;
1052
1053         if (!len) {
1054                 if (dev->ifalias) {
1055                         kfree(dev->ifalias);
1056                         dev->ifalias = NULL;
1057                 }
1058                 return 0;
1059         }
1060
1061         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062         if (!dev->ifalias)
1063                 return -ENOMEM;
1064
1065         strlcpy(dev->ifalias, alias, len+1);
1066         return len;
1067 }
1068
1069
1070 /**
1071  *      netdev_features_change - device changes features
1072  *      @dev: device to cause notification
1073  *
1074  *      Called to indicate a device has changed features.
1075  */
1076 void netdev_features_change(struct net_device *dev)
1077 {
1078         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079 }
1080 EXPORT_SYMBOL(netdev_features_change);
1081
1082 /**
1083  *      netdev_state_change - device changes state
1084  *      @dev: device to cause notification
1085  *
1086  *      Called to indicate a device has changed state. This function calls
1087  *      the notifier chains for netdev_chain and sends a NEWLINK message
1088  *      to the routing socket.
1089  */
1090 void netdev_state_change(struct net_device *dev)
1091 {
1092         if (dev->flags & IFF_UP) {
1093                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095         }
1096 }
1097 EXPORT_SYMBOL(netdev_state_change);
1098
/*
 * Pass @event for @dev to the netdev notifier chain and hand back the
 * value returned by call_netdevice_notifiers().
 */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int chain_ret = call_netdevice_notifiers(event, dev);

	return chain_ret;
}
EXPORT_SYMBOL(netdev_bonding_change);
1104
1105 /**
1106  *      dev_load        - load a network module
1107  *      @net: the applicable net namespace
1108  *      @name: name of interface
1109  *
1110  *      If a network interface is not present and the process has suitable
1111  *      privileges this function loads the module. If module loading is not
1112  *      available in this kernel then it becomes a nop.
1113  */
1114
1115 void dev_load(struct net *net, const char *name)
1116 {
1117         struct net_device *dev;
1118         int no_module;
1119
1120         rcu_read_lock();
1121         dev = dev_get_by_name_rcu(net, name);
1122         rcu_read_unlock();
1123
1124         no_module = !dev;
1125         if (no_module && capable(CAP_NET_ADMIN))
1126                 no_module = request_module("netdev-%s", name);
1127         if (no_module && capable(CAP_SYS_MODULE)) {
1128                 if (!request_module("%s", name))
1129                         pr_err("Loading kernel module for a network device "
1130 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1131 "instead\n", name);
1132         }
1133 }
1134 EXPORT_SYMBOL(dev_load);
1135
1136 static int __dev_open(struct net_device *dev)
1137 {
1138         const struct net_device_ops *ops = dev->netdev_ops;
1139         int ret;
1140
1141         ASSERT_RTNL();
1142
1143         if (!netif_device_present(dev))
1144                 return -ENODEV;
1145
1146         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1147         ret = notifier_to_errno(ret);
1148         if (ret)
1149                 return ret;
1150
1151         set_bit(__LINK_STATE_START, &dev->state);
1152
1153         if (ops->ndo_validate_addr)
1154                 ret = ops->ndo_validate_addr(dev);
1155
1156         if (!ret && ops->ndo_open)
1157                 ret = ops->ndo_open(dev);
1158
1159         if (ret)
1160                 clear_bit(__LINK_STATE_START, &dev->state);
1161         else {
1162                 dev->flags |= IFF_UP;
1163                 net_dmaengine_get();
1164                 dev_set_rx_mode(dev);
1165                 dev_activate(dev);
1166         }
1167
1168         return ret;
1169 }
1170
1171 /**
1172  *      dev_open        - prepare an interface for use.
1173  *      @dev:   device to open
1174  *
1175  *      Takes a device from down to up state. The device's private open
1176  *      function is invoked and then the multicast lists are loaded. Finally
1177  *      the device is moved into the up state and a %NETDEV_UP message is
1178  *      sent to the netdev notifier chain.
1179  *
1180  *      Calling this function on an active interface is a nop. On a failure
1181  *      a negative errno code is returned.
1182  */
1183 int dev_open(struct net_device *dev)
1184 {
1185         int ret;
1186
1187         if (dev->flags & IFF_UP)
1188                 return 0;
1189
1190         ret = __dev_open(dev);
1191         if (ret < 0)
1192                 return ret;
1193
1194         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1195         call_netdevice_notifiers(NETDEV_UP, dev);
1196
1197         return ret;
1198 }
1199 EXPORT_SYMBOL(dev_open);
1200
1201 static int __dev_close_many(struct list_head *head)
1202 {
1203         struct net_device *dev;
1204
1205         ASSERT_RTNL();
1206         might_sleep();
1207
1208         list_for_each_entry(dev, head, unreg_list) {
1209                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1210
1211                 clear_bit(__LINK_STATE_START, &dev->state);
1212
1213                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1214                  * can be even on different cpu. So just clear netif_running().
1215                  *
1216                  * dev->stop() will invoke napi_disable() on all of it's
1217                  * napi_struct instances on this device.
1218                  */
1219                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1220         }
1221
1222         dev_deactivate_many(head);
1223
1224         list_for_each_entry(dev, head, unreg_list) {
1225                 const struct net_device_ops *ops = dev->netdev_ops;
1226
1227                 /*
1228                  *      Call the device specific close. This cannot fail.
1229                  *      Only if device is UP
1230                  *
1231                  *      We allow it to be called even after a DETACH hot-plug
1232                  *      event.
1233                  */
1234                 if (ops->ndo_stop)
1235                         ops->ndo_stop(dev);
1236
1237                 dev->flags &= ~IFF_UP;
1238                 net_dmaengine_put();
1239         }
1240
1241         return 0;
1242 }
1243
1244 static int __dev_close(struct net_device *dev)
1245 {
1246         int retval;
1247         LIST_HEAD(single);
1248
1249         list_add(&dev->unreg_list, &single);
1250         retval = __dev_close_many(&single);
1251         list_del(&single);
1252         return retval;
1253 }
1254
1255 static int dev_close_many(struct list_head *head)
1256 {
1257         struct net_device *dev, *tmp;
1258         LIST_HEAD(tmp_list);
1259
1260         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1261                 if (!(dev->flags & IFF_UP))
1262                         list_move(&dev->unreg_list, &tmp_list);
1263
1264         __dev_close_many(head);
1265
1266         list_for_each_entry(dev, head, unreg_list) {
1267                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1268                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1269         }
1270
1271         /* rollback_registered_many needs the complete original list */
1272         list_splice(&tmp_list, head);
1273         return 0;
1274 }
1275
1276 /**
1277  *      dev_close - shutdown an interface.
1278  *      @dev: device to shutdown
1279  *
1280  *      This function moves an active device into down state. A
1281  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1282  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1283  *      chain.
1284  */
1285 int dev_close(struct net_device *dev)
1286 {
1287         LIST_HEAD(single);
1288
1289         list_add(&dev->unreg_list, &single);
1290         dev_close_many(&single);
1291         list_del(&single);
1292         return 0;
1293 }
1294 EXPORT_SYMBOL(dev_close);
1295
1296
1297 /**
1298  *      dev_disable_lro - disable Large Receive Offload on a device
1299  *      @dev: device
1300  *
1301  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1302  *      called under RTNL.  This is needed if received packets may be
1303  *      forwarded to another interface.
1304  */
1305 void dev_disable_lro(struct net_device *dev)
1306 {
1307         u32 flags;
1308
1309         if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1310                 flags = dev->ethtool_ops->get_flags(dev);
1311         else
1312                 flags = ethtool_op_get_flags(dev);
1313
1314         if (!(flags & ETH_FLAG_LRO))
1315                 return;
1316
1317         __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1318         if (unlikely(dev->features & NETIF_F_LRO))
1319                 netdev_WARN(dev, "failed to disable LRO!\n");
1320 }
1321 EXPORT_SYMBOL(dev_disable_lro);
1322
1323
1324 static int dev_boot_phase = 1;
1325
1326 /**
1327  *      register_netdevice_notifier - register a network notifier block
1328  *      @nb: notifier
1329  *
1330  *      Register a notifier to be called when network device events occur.
1331  *      The notifier passed is linked into the kernel structures and must
1332  *      not be reused until it has been unregistered. A negative errno code
1333  *      is returned on a failure.
1334  *
1335  *      When registered all registration and up events are replayed
1336  *      to the new notifier to allow device to have a race free
1337  *      view of the network device list.
1338  */
1339
1340 int register_netdevice_notifier(struct notifier_block *nb)
1341 {
1342         struct net_device *dev;
1343         struct net_device *last;
1344         struct net *net;
1345         int err;
1346
1347         rtnl_lock();
1348         err = raw_notifier_chain_register(&netdev_chain, nb);
1349         if (err)
1350                 goto unlock;
1351         if (dev_boot_phase)
1352                 goto unlock;
1353         for_each_net(net) {
1354                 for_each_netdev(net, dev) {
1355                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1356                         err = notifier_to_errno(err);
1357                         if (err)
1358                                 goto rollback;
1359
1360                         if (!(dev->flags & IFF_UP))
1361                                 continue;
1362
1363                         nb->notifier_call(nb, NETDEV_UP, dev);
1364                 }
1365         }
1366
1367 unlock:
1368         rtnl_unlock();
1369         return err;
1370
1371 rollback:
1372         last = dev;
1373         for_each_net(net) {
1374                 for_each_netdev(net, dev) {
1375                         if (dev == last)
1376                                 break;
1377
1378                         if (dev->flags & IFF_UP) {
1379                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1380                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1381                         }
1382                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1383                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1384                 }
1385         }
1386
1387         raw_notifier_chain_unregister(&netdev_chain, nb);
1388         goto unlock;
1389 }
1390 EXPORT_SYMBOL(register_netdevice_notifier);
1391
1392 /**
1393  *      unregister_netdevice_notifier - unregister a network notifier block
1394  *      @nb: notifier
1395  *
1396  *      Unregister a notifier previously registered by
1397  *      register_netdevice_notifier(). The notifier is unlinked into the
1398  *      kernel structures and may then be reused. A negative errno code
1399  *      is returned on a failure.
1400  */
1401
1402 int unregister_netdevice_notifier(struct notifier_block *nb)
1403 {
1404         int err;
1405
1406         rtnl_lock();
1407         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1408         rtnl_unlock();
1409         return err;
1410 }
1411 EXPORT_SYMBOL(unregister_netdevice_notifier);
1412
1413 /**
1414  *      call_netdevice_notifiers - call all network notifier blocks
1415  *      @val: value passed unmodified to notifier function
1416  *      @dev: net_device pointer passed unmodified to notifier function
1417  *
1418  *      Call all network notifier blocks.  Parameters and return value
1419  *      are as for raw_notifier_call_chain().
1420  */
1421
1422 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1423 {
1424         ASSERT_RTNL();
1425         return raw_notifier_call_chain(&netdev_chain, val, dev);
1426 }
1427 EXPORT_SYMBOL(call_netdevice_notifiers);
1428
1429 /* When > 0 there are consumers of rx skb time stamps */
1430 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1431
1432 void net_enable_timestamp(void)
1433 {
1434         atomic_inc(&netstamp_needed);
1435 }
1436 EXPORT_SYMBOL(net_enable_timestamp);
1437
1438 void net_disable_timestamp(void)
1439 {
1440         atomic_dec(&netstamp_needed);
1441 }
1442 EXPORT_SYMBOL(net_disable_timestamp);
1443
1444 static inline void net_timestamp_set(struct sk_buff *skb)
1445 {
1446         if (atomic_read(&netstamp_needed))
1447                 __net_timestamp(skb);
1448         else
1449                 skb->tstamp.tv64 = 0;
1450 }
1451
1452 static inline void net_timestamp_check(struct sk_buff *skb)
1453 {
1454         if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1455                 __net_timestamp(skb);
1456 }
1457
1458 static inline bool is_skb_forwardable(struct net_device *dev,
1459                                       struct sk_buff *skb)
1460 {
1461         unsigned int len;
1462
1463         if (!(dev->flags & IFF_UP))
1464                 return false;
1465
1466         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1467         if (skb->len <= len)
1468                 return true;
1469
1470         /* if TSO is enabled, we don't care about the length as the packet
1471          * could be forwarded without being segmented before
1472          */
1473         if (skb_is_gso(skb))
1474                 return true;
1475
1476         return false;
1477 }
1478
1479 /**
1480  * dev_forward_skb - loopback an skb to another netif
1481  *
1482  * @dev: destination network device
1483  * @skb: buffer to forward
1484  *
1485  * return values:
1486  *      NET_RX_SUCCESS  (no congestion)
1487  *      NET_RX_DROP     (packet was dropped, but freed)
1488  *
1489  * dev_forward_skb can be used for injecting an skb from the
1490  * start_xmit function of one device into the receive queue
1491  * of another device.
1492  *
1493  * The receiving device may be in another namespace, so
1494  * we have to clear all information in the skb that could
1495  * impact namespace isolation.
1496  */
1497 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1498 {
1499         skb_orphan(skb);
1500         nf_reset(skb);
1501
1502         if (unlikely(!is_skb_forwardable(dev, skb))) {
1503                 atomic_long_inc(&dev->rx_dropped);
1504                 kfree_skb(skb);
1505                 return NET_RX_DROP;
1506         }
1507         skb_set_dev(skb, dev);
1508         skb->tstamp.tv64 = 0;
1509         skb->pkt_type = PACKET_HOST;
1510         skb->protocol = eth_type_trans(skb, dev);
1511         return netif_rx(skb);
1512 }
1513 EXPORT_SYMBOL_GPL(dev_forward_skb);
1514
1515 static inline int deliver_skb(struct sk_buff *skb,
1516                               struct packet_type *pt_prev,
1517                               struct net_device *orig_dev)
1518 {
1519         atomic_inc(&skb->users);
1520         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1521 }
1522
1523 /*
1524  *      Support routine. Sends outgoing frames to any network
1525  *      taps currently in use.
1526  */
1527
1528 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1529 {
1530         struct packet_type *ptype;
1531         struct sk_buff *skb2 = NULL;
1532         struct packet_type *pt_prev = NULL;
1533
1534         rcu_read_lock();
1535         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1536                 /* Never send packets back to the socket
1537                  * they originated from - MvS (miquels@drinkel.ow.org)
1538                  */
1539                 if ((ptype->dev == dev || !ptype->dev) &&
1540                     (ptype->af_packet_priv == NULL ||
1541                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1542                         if (pt_prev) {
1543                                 deliver_skb(skb2, pt_prev, skb->dev);
1544                                 pt_prev = ptype;
1545                                 continue;
1546                         }
1547
1548                         skb2 = skb_clone(skb, GFP_ATOMIC);
1549                         if (!skb2)
1550                                 break;
1551
1552                         net_timestamp_set(skb2);
1553
1554                         /* skb->nh should be correctly
1555                            set by sender, so that the second statement is
1556                            just protection against buggy protocols.
1557                          */
1558                         skb_reset_mac_header(skb2);
1559
1560                         if (skb_network_header(skb2) < skb2->data ||
1561                             skb2->network_header > skb2->tail) {
1562                                 if (net_ratelimit())
1563                                         printk(KERN_CRIT "protocol %04x is "
1564                                                "buggy, dev %s\n",
1565                                                ntohs(skb2->protocol),
1566                                                dev->name);
1567                                 skb_reset_network_header(skb2);
1568                         }
1569
1570                         skb2->transport_header = skb2->network_header;
1571                         skb2->pkt_type = PACKET_OUTGOING;
1572                         pt_prev = ptype;
1573                 }
1574         }
1575         if (pt_prev)
1576                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1577         rcu_read_unlock();
1578 }
1579
1580 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1581  * @dev: Network device
1582  * @txq: number of queues available
1583  *
1584  * If real_num_tx_queues is changed the tc mappings may no longer be
1585  * valid. To resolve this verify the tc mapping remains valid and if
1586  * not NULL the mapping. With no priorities mapping to this
1587  * offset/count pair it will no longer be used. In the worst case TC0
1588  * is invalid nothing can be done so disable priority mappings. If is
1589  * expected that drivers will fix this mapping if they can before
1590  * calling netif_set_real_num_tx_queues.
1591  */
1592 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1593 {
1594         int i;
1595         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1596
1597         /* If TC0 is invalidated disable TC mapping */
1598         if (tc->offset + tc->count > txq) {
1599                 pr_warning("Number of in use tx queues changed "
1600                            "invalidating tc mappings. Priority "
1601                            "traffic classification disabled!\n");
1602                 dev->num_tc = 0;
1603                 return;
1604         }
1605
1606         /* Invalidated prio to tc mappings set to TC0 */
1607         for (i = 1; i < TC_BITMASK + 1; i++) {
1608                 int q = netdev_get_prio_tc_map(dev, i);
1609
1610                 tc = &dev->tc_to_txq[q];
1611                 if (tc->offset + tc->count > txq) {
1612                         pr_warning("Number of in use tx queues "
1613                                    "changed. Priority %i to tc "
1614                                    "mapping %i is no longer valid "
1615                                    "setting map to 0\n",
1616                                    i, q);
1617                         netdev_set_prio_tc_map(dev, i, 0);
1618                 }
1619         }
1620 }
1621
1622 /*
1623  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1624  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1625  */
1626 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1627 {
1628         int rc;
1629
1630         if (txq < 1 || txq > dev->num_tx_queues)
1631                 return -EINVAL;
1632
1633         if (dev->reg_state == NETREG_REGISTERED ||
1634             dev->reg_state == NETREG_UNREGISTERING) {
1635                 ASSERT_RTNL();
1636
1637                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1638                                                   txq);
1639                 if (rc)
1640                         return rc;
1641
1642                 if (dev->num_tc)
1643                         netif_setup_tc(dev, txq);
1644
1645                 if (txq < dev->real_num_tx_queues)
1646                         qdisc_reset_all_tx_gt(dev, txq);
1647         }
1648
1649         dev->real_num_tx_queues = txq;
1650         return 0;
1651 }
1652 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1653
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, -EINVAL
 *	if @rxq is 0 or larger than dev->num_rx_queues, or the error of
 *	net_rx_queue_update_kobjects() for a registered device.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	/* The queue kobjects are only updated for a registered device. */
	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1686
1687 static inline void __netif_reschedule(struct Qdisc *q)
1688 {
1689         struct softnet_data *sd;
1690         unsigned long flags;
1691
1692         local_irq_save(flags);
1693         sd = &__get_cpu_var(softnet_data);
1694         q->next_sched = NULL;
1695         *sd->output_queue_tailp = q;
1696         sd->output_queue_tailp = &q->next_sched;
1697         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1698         local_irq_restore(flags);
1699 }
1700
1701 void __netif_schedule(struct Qdisc *q)
1702 {
1703         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1704                 __netif_reschedule(q);
1705 }
1706 EXPORT_SYMBOL(__netif_schedule);
1707
1708 void dev_kfree_skb_irq(struct sk_buff *skb)
1709 {
1710         if (atomic_dec_and_test(&skb->users)) {
1711                 struct softnet_data *sd;
1712                 unsigned long flags;
1713
1714                 local_irq_save(flags);
1715                 sd = &__get_cpu_var(softnet_data);
1716                 skb->next = sd->completion_queue;
1717                 sd->completion_queue = skb;
1718                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1719                 local_irq_restore(flags);
1720         }
1721 }
1722 EXPORT_SYMBOL(dev_kfree_skb_irq);
1723
/*
 * Release @skb from any context: dev_kfree_skb() is used unless we are
 * in hard-IRQ context or interrupts are disabled, in which case the skb
 * goes through dev_kfree_skb_irq().
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
        if (!in_irq() && !irqs_disabled()) {
                dev_kfree_skb(skb);
                return;
        }

        dev_kfree_skb_irq(skb);
}
1731 EXPORT_SYMBOL(dev_kfree_skb_any);
1732
1733
1734 /**
1735  * netif_device_detach - mark device as removed
1736  * @dev: network device
1737  *
1738  * Mark device as removed from system and therefore no longer available.
1739  */
1740 void netif_device_detach(struct net_device *dev)
1741 {
1742         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1743             netif_running(dev)) {
1744                 netif_tx_stop_all_queues(dev);
1745         }
1746 }
1747 EXPORT_SYMBOL(netif_device_detach);
1748
1749 /**
1750  * netif_device_attach - mark device as attached
1751  * @dev: network device
1752  *
1753  * Mark device as attached from system and restart if needed.
1754  */
1755 void netif_device_attach(struct net_device *dev)
1756 {
1757         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1758             netif_running(dev)) {
1759                 netif_tx_wake_all_queues(dev);
1760                 __netdev_watchdog_up(dev);
1761         }
1762 }
1763 EXPORT_SYMBOL(netif_device_attach);
1764
/**
 * skb_set_dev - assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * The dst reference held by the buffer is always dropped.  If the skb
 * already belongs to a device that lives in a different network
 * namespace than @dev, the state private to the old namespace (secpath,
 * netfilter state, secmark, mark, priority, nf_trace, ipvs_property and
 * tc_index) is reset before the new device is assigned.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
        struct net_device *old_dev;

        skb_dst_drop(skb);

        old_dev = skb->dev;
        if (old_dev != NULL && !net_eq(dev_net(old_dev), dev_net(dev))) {
                secpath_reset(skb);
                nf_reset(skb);
                skb_init_secmark(skb);
                skb->mark = 0;
                skb->priority = 0;
                skb->nf_trace = 0;
                skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
                skb->tc_index = 0;
#endif
        }

        skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1794
1795 /*
1796  * Invalidate hardware checksum when packet is to be mangled, and
1797  * complete checksum manually on outgoing path.
1798  */
1799 int skb_checksum_help(struct sk_buff *skb)
1800 {
1801         __wsum csum;
1802         int ret = 0, offset;
1803
1804         if (skb->ip_summed == CHECKSUM_COMPLETE)
1805                 goto out_set_summed;
1806
1807         if (unlikely(skb_shinfo(skb)->gso_size)) {
1808                 /* Let GSO fix up the checksum. */
1809                 goto out_set_summed;
1810         }
1811
1812         offset = skb_checksum_start_offset(skb);
1813         BUG_ON(offset >= skb_headlen(skb));
1814         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1815
1816         offset += skb->csum_offset;
1817         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1818
1819         if (skb_cloned(skb) &&
1820             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1821                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1822                 if (ret)
1823                         goto out;
1824         }
1825
1826         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1827 out_set_summed:
1828         skb->ip_summed = CHECKSUM_NONE;
1829 out:
1830         return ret;
1831 }
1832 EXPORT_SYMBOL(skb_checksum_help);
1833
1834 /**
1835  *      skb_gso_segment - Perform segmentation on skb.
1836  *      @skb: buffer to segment
1837  *      @features: features for the output path (see dev->features)
1838  *
1839  *      This function segments the given skb and returns a list of segments.
1840  *
1841  *      It may return NULL if the skb requires no segmentation.  This is
1842  *      only possible when GSO is used for verifying header integrity.
1843  */
1844 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1845 {
1846         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1847         struct packet_type *ptype;
1848         __be16 type = skb->protocol;
1849         int vlan_depth = ETH_HLEN;
1850         int err;
1851
1852         while (type == htons(ETH_P_8021Q)) {
1853                 struct vlan_hdr *vh;
1854
1855                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1856                         return ERR_PTR(-EINVAL);
1857
1858                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1859                 type = vh->h_vlan_encapsulated_proto;
1860                 vlan_depth += VLAN_HLEN;
1861         }
1862
1863         skb_reset_mac_header(skb);
1864         skb->mac_len = skb->network_header - skb->mac_header;
1865         __skb_pull(skb, skb->mac_len);
1866
1867         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1868                 struct net_device *dev = skb->dev;
1869                 struct ethtool_drvinfo info = {};
1870
1871                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1872                         dev->ethtool_ops->get_drvinfo(dev, &info);
1873
1874                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1875                      info.driver, dev ? dev->features : 0L,
1876                      skb->sk ? skb->sk->sk_route_caps : 0L,
1877                      skb->len, skb->data_len, skb->ip_summed);
1878
1879                 if (skb_header_cloned(skb) &&
1880                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1881                         return ERR_PTR(err);
1882         }
1883
1884         rcu_read_lock();
1885         list_for_each_entry_rcu(ptype,
1886                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1887                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1888                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1889                                 err = ptype->gso_send_check(skb);
1890                                 segs = ERR_PTR(err);
1891                                 if (err || skb_gso_ok(skb, features))
1892                                         break;
1893                                 __skb_push(skb, (skb->data -
1894                                                  skb_network_header(skb)));
1895                         }
1896                         segs = ptype->gso_segment(skb, features);
1897                         break;
1898                 }
1899         }
1900         rcu_read_unlock();
1901
1902         __skb_push(skb, skb->data - skb_mac_header(skb));
1903
1904         return segs;
1905 }
1906 EXPORT_SYMBOL(skb_gso_segment);
1907
/*
 * Take action when hardware reception checksum errors are detected:
 * log a rate-limited error naming the device and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
        const char *name;

        if (!net_ratelimit())
                return;

        name = dev ? dev->name : "<unknown>";
        printk(KERN_ERR "%s: hw csum failure.\n", name);
        dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1920
/*
 * Report whether @skb carries a page fragment that @dev cannot DMA from.
 * Without CONFIG_HIGHMEM the answer is always 0.
 *
 * Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        struct device *parent;
        int n;

        /* Highmem fragments need NETIF_F_HIGHDMA. */
        if (!(dev->features & NETIF_F_HIGHDMA)) {
                for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
                        if (PageHighMem(skb_shinfo(skb)->frags[n].page))
                                return 1;
                }
        }

        if (!PCI_DMA_BUS_IS_PHYS)
                return 0;

        parent = dev->dev.parent;
        if (!parent)
                return 0;

        /* Every fragment page must end within the parent's DMA mask. */
        for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
                dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[n].page);

                if (!parent->dma_mask ||
                    addr + PAGE_SIZE - 1 > *parent->dma_mask)
                        return 1;
        }
#endif
        return 0;
}
1950
/*
 * Layout of skb->cb while a GSO skb carries its list of segments: it
 * keeps the destructor the skb had before dev_gso_segment() replaced it
 * with dev_gso_skb_destructor().
 */
1951 struct dev_gso_cb {
1952         void (*destructor)(struct sk_buff *skb);
1953 };
1954
/* Access the control block of @skb as a struct dev_gso_cb. */
1955 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1956
1957 static void dev_gso_skb_destructor(struct sk_buff *skb)
1958 {
1959         struct dev_gso_cb *cb;
1960
1961         do {
1962                 struct sk_buff *nskb = skb->next;
1963
1964                 skb->next = nskb->next;
1965                 nskb->next = NULL;
1966                 kfree_skb(nskb);
1967         } while (skb->next);
1968
1969         cb = DEV_GSO_CB(skb);
1970         if (cb->destructor)
1971                 cb->destructor(skb);
1972 }
1973
1974 /**
1975  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1976  *      @skb: buffer to segment
1977  *      @features: device features as applicable to this skb
1978  *
1979  *      This function segments the given skb and stores the list of segments
1980  *      in skb->next.
1981  */
1982 static int dev_gso_segment(struct sk_buff *skb, int features)
1983 {
1984         struct sk_buff *segs;
1985
1986         segs = skb_gso_segment(skb, features);
1987
1988         /* Verifying header integrity only. */
1989         if (!segs)
1990                 return 0;
1991
1992         if (IS_ERR(segs))
1993                 return PTR_ERR(segs);
1994
1995         skb->next = segs;
1996         DEV_GSO_CB(skb)->destructor = skb->destructor;
1997         skb->destructor = dev_gso_skb_destructor;
1998
1999         return 0;
2000 }
2001
2002 /*
2003  * Try to orphan skb early, right before transmission by the device.
2004  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2005  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2006  */
2007 static inline void skb_orphan_try(struct sk_buff *skb)
2008 {
2009         struct sock *sk = skb->sk;
2010
2011         if (sk && !skb_shinfo(skb)->tx_flags) {
2012                 /* skb_tx_hash() wont be able to get sk.
2013                  * We copy sk_hash into skb->rxhash
2014                  */
2015                 if (!skb->rxhash)
2016                         skb->rxhash = sk->sk_hash;
2017                 skb_orphan(skb);
2018         }
2019 }
2020
2021 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2022 {
2023         return ((features & NETIF_F_GEN_CSUM) ||
2024                 ((features & NETIF_F_V4_CSUM) &&
2025                  protocol == htons(ETH_P_IP)) ||
2026                 ((features & NETIF_F_V6_CSUM) &&
2027                  protocol == htons(ETH_P_IPV6)) ||
2028                 ((features & NETIF_F_FCOE_CRC) &&
2029                  protocol == htons(ETH_P_FCOE)));
2030 }
2031
2032 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2033 {
2034         if (!can_checksum_protocol(features, protocol)) {
2035                 features &= ~NETIF_F_ALL_CSUM;
2036                 features &= ~NETIF_F_SG;
2037         } else if (illegal_highdma(skb->dev, skb)) {
2038                 features &= ~NETIF_F_SG;
2039         }
2040
2041         return features;
2042 }
2043
2044 u32 netif_skb_features(struct sk_buff *skb)
2045 {
2046         __be16 protocol = skb->protocol;
2047         u32 features = skb->dev->features;
2048
2049         if (protocol == htons(ETH_P_8021Q)) {
2050                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2051                 protocol = veh->h_vlan_encapsulated_proto;
2052         } else if (!vlan_tx_tag_present(skb)) {
2053                 return harmonize_features(skb, protocol, features);
2054         }
2055
2056         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2057
2058         if (protocol != htons(ETH_P_8021Q)) {
2059                 return harmonize_features(skb, protocol, features);
2060         } else {
2061                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2062                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2063                 return harmonize_features(skb, protocol, features);
2064         }
2065 }
2066 EXPORT_SYMBOL(netif_skb_features);
2067
2068 /*
2069  * Returns true if either:
2070  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2071  *      2. skb is fragmented and the device does not support SG, or if
2072  *         at least one of fragments is in highmem and device does not
2073  *         support DMA from it.
2074  */
2075 static inline int skb_needs_linearize(struct sk_buff *skb,
2076                                       int features)
2077 {
2078         return skb_is_nonlinear(skb) &&
2079                         ((skb_has_frag_list(skb) &&
2080                                 !(features & NETIF_F_FRAGLIST)) ||
2081                         (skb_shinfo(skb)->nr_frags &&
2082                                 !(features & NETIF_F_SG)));
2083 }
2084
/*
 * dev_hard_start_xmit - hand an skb, or its pending list of GSO
 * segments, to the driver.
 * @skb: buffer to transmit; when skb->next is already set only the
 *       segment loop below is run
 * @dev: device whose ndo_start_xmit() is called
 * @txq: TX queue, used for txq_trans_update() and the stopped check
 *
 * Returns the result of ndo_start_xmit(), or NETDEV_TX_BUSY when the
 * queue is found stopped while segments remain.  NETDEV_TX_OK is also
 * what the caller sees when the skb is dropped in here (VLAN tag
 * insertion, segmentation, linearization or checksum failure).
 */
2085 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2086                         struct netdev_queue *txq)
2087 {
2088         const struct net_device_ops *ops = dev->netdev_ops;
2089         int rc = NETDEV_TX_OK;
2090
             /* No segment list attached: prepare and send the skb itself. */
2091         if (likely(!skb->next)) {
2092                 u32 features;
2093
2094                 /*
2095                  * If device doesn't need skb->dst, release it right now while
2096                  * its hot in this cpu cache
2097                  */
2098                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2099                         skb_dst_drop(skb);
2100
2101                 if (!list_empty(&ptype_all))
2102                         dev_queue_xmit_nit(skb, dev);
2103
2104                 skb_orphan_try(skb);
2105
2106                 features = netif_skb_features(skb);
2107
                     /* The device cannot insert the tag: put it in the data. */
2108                 if (vlan_tx_tag_present(skb) &&
2109                     !(features & NETIF_F_HW_VLAN_TX)) {
2110                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
                             /* skb is NULL here, so kfree_skb() is skipped. */
2111                         if (unlikely(!skb))
2112                                 goto out;
2113
2114                         skb->vlan_tci = 0;
2115                 }
2116
2117                 if (netif_needs_gso(skb, features)) {
2118                         if (unlikely(dev_gso_segment(skb, features)))
2119                                 goto out_kfree_skb;
                             /*
                              * A list was attached: send it segment by
                              * segment.  Otherwise fall through and send
                              * the skb as it is.
                              */
2120                         if (skb->next)
2121                                 goto gso;
2122                 } else {
2123                         if (skb_needs_linearize(skb, features) &&
2124                             __skb_linearize(skb))
2125                                 goto out_kfree_skb;
2126
2127                         /* If packet is not checksummed and device does not
2128                          * support checksumming for this protocol, complete
2129                          * checksumming here.
2130                          */
2131                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2132                                 skb_set_transport_header(skb,
2133                                         skb_checksum_start_offset(skb));
2134                                 if (!(features & NETIF_F_ALL_CSUM) &&
2135                                      skb_checksum_help(skb))
2136                                         goto out_kfree_skb;
2137                         }
2138                 }
2139
2140                 rc = ops->ndo_start_xmit(skb, dev);
2141                 trace_net_dev_xmit(skb, rc);
2142                 if (rc == NETDEV_TX_OK)
2143                         txq_trans_update(txq);
2144                 return rc;
2145         }
2146
2147 gso:
2148         do {
                     /* Unlink the first pending segment from the list. */
2149                 struct sk_buff *nskb = skb->next;
2150
2151                 skb->next = nskb->next;
2152                 nskb->next = NULL;
2153
2154                 /*
2155                  * If device doesn't need nskb->dst, release it right now while
2156                  * its hot in this cpu cache
2157                  */
2158                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2159                         skb_dst_drop(nskb);
2160
2161                 rc = ops->ndo_start_xmit(nskb, dev);
2162                 trace_net_dev_xmit(nskb, rc);
2163                 if (unlikely(rc != NETDEV_TX_OK)) {
                             /*
                              * Codes outside NETDEV_TX_MASK give up on the
                              * whole skb; anything else puts the segment
                              * back at the head of the list and returns the
                              * code to the caller.
                              */
2164                         if (rc & ~NETDEV_TX_MASK)
2165                                 goto out_kfree_gso_skb;
2166                         nskb->next = skb->next;
2167                         skb->next = nskb;
2168                         return rc;
2169                 }
2170                 txq_trans_update(txq);
2171                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2172                         return NETDEV_TX_BUSY;
2173         } while (skb->next);
2174
2175 out_kfree_gso_skb:
             /*
              * With no segment left, restore the destructor saved by
              * dev_gso_segment().  If segments remain, the destructor stays
              * dev_gso_skb_destructor(), which frees them from kfree_skb().
              */
2176         if (likely(skb->next == NULL))
2177                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2178 out_kfree_skb:
2179         kfree_skb(skb);
2180 out:
2181         return rc;
2182 }
2183
2184 static u32 hashrnd __read_mostly;
2185
2186 /*
2187  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2188  * to be used as a distribution range.
2189  */
2190 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2191                   unsigned int num_tx_queues)
2192 {
2193         u32 hash;
2194         u16 qoffset = 0;
2195         u16 qcount = num_tx_queues;
2196
2197         if (skb_rx_queue_recorded(skb)) {
2198                 hash = skb_get_rx_queue(skb);
2199                 while (unlikely(hash >= num_tx_queues))
2200                         hash -= num_tx_queues;
2201                 return hash;
2202         }
2203
2204         if (dev->num_tc) {
2205                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2206                 qoffset = dev->tc_to_txq[tc].offset;
2207                 qcount = dev->tc_to_txq[tc].count;
2208         }
2209
2210         if (skb->sk && skb->sk->sk_hash)
2211                 hash = skb->sk->sk_hash;
2212         else
2213                 hash = (__force u16) skb->protocol ^ skb->rxhash;
2214         hash = jhash_1word(hash, hashrnd);
2215
2216         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2217 }
2218 EXPORT_SYMBOL(__skb_tx_hash);
2219
2220 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2221 {
2222         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2223                 if (net_ratelimit()) {
2224                         pr_warning("%s selects TX queue %d, but "
2225                                 "real number of TX queues is %d\n",
2226                                 dev->name, queue_index, dev->real_num_tx_queues);
2227                 }
2228                 return 0;
2229         }
2230         return queue_index;
2231 }
2232
/*
 * Look up a TX queue for @skb in the XPS map of the current CPU.
 * Returns the queue index, or -1 when XPS is not built in, no map
 * exists for this CPU, or the mapped queue is beyond the real number of
 * TX queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *maps;
        struct xps_map *cpu_map;
        int picked = -1;

        rcu_read_lock();

        maps = rcu_dereference(dev->xps_maps);
        if (!maps)
                goto unlock;

        cpu_map = rcu_dereference(maps->cpu_map[raw_smp_processor_id()]);
        if (!cpu_map)
                goto unlock;

        if (cpu_map->len == 1) {
                picked = cpu_map->queues[0];
        } else {
                u32 seed;

                /* Several candidates: spread flows over them by hash. */
                if (skb->sk && skb->sk->sk_hash)
                        seed = skb->sk->sk_hash;
                else
                        seed = (__force u16) skb->protocol ^ skb->rxhash;
                seed = jhash_1word(seed, hashrnd);
                picked = cpu_map->queues[((u64)seed * cpu_map->len) >> 32];
        }

        if (unlikely(picked >= dev->real_num_tx_queues))
                picked = -1;

unlock:
        rcu_read_unlock();

        return picked;
#else
        return -1;
#endif
}
2270
2271 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2272                                         struct sk_buff *skb)
2273 {
2274         int queue_index;
2275         const struct net_device_ops *ops = dev->netdev_ops;
2276
2277         if (dev->real_num_tx_queues == 1)
2278                 queue_index = 0;
2279         else if (ops->ndo_select_queue) {
2280                 queue_index = ops->ndo_select_queue(dev, skb);
2281                 queue_index = dev_cap_txqueue(dev, queue_index);
2282         } else {
2283                 struct sock *sk = skb->sk;
2284                 queue_index = sk_tx_queue_get(sk);
2285
2286                 if (queue_index < 0 || skb->ooo_okay ||
2287                     queue_index >= dev->real_num_tx_queues) {
2288                         int old_index = queue_index;
2289
2290                         queue_index = get_xps_queue(dev, skb);
2291                         if (queue_index < 0)
2292                                 queue_index = skb_tx_hash(dev, skb);
2293
2294                         if (queue_index != old_index && sk) {
2295                                 struct dst_entry *dst =
2296                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2297
2298                                 if (dst && skb_dst(skb) == dst)
2299                                         sk_tx_queue_set(sk, queue_index);
2300                         }
2301                 }
2302         }
2303
2304         skb_set_queue_mapping(skb, queue_index);
2305         return netdev_get_tx_queue(dev, queue_index);
2306 }
2307
/*
 * __dev_xmit_skb - send @skb through qdisc @q, under the qdisc root lock.
 *
 * Returns NET_XMIT_DROP (after freeing the skb) when the qdisc is
 * deactivated, NET_XMIT_SUCCESS when the skb took the direct-transmit
 * bypass, and otherwise the result of q->enqueue() masked with
 * NET_XMIT_MASK.
 */
2308 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2309                                  struct net_device *dev,
2310                                  struct netdev_queue *txq)
2311 {
2312         spinlock_t *root_lock = qdisc_lock(q);
2313         bool contended;
2314         int rc;
2315
2316         qdisc_skb_cb(skb)->pkt_len = skb->len;
2317         qdisc_calculate_pkt_len(skb, q);
2318         /*
2319          * Heuristic to force contended enqueues to serialize on a
2320          * separate lock before trying to get qdisc main lock.
2321          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2322          * and dequeue packets faster.
2323          */
2324         contended = qdisc_is_running(q);
2325         if (unlikely(contended))
2326                 spin_lock(&q->busylock);
2327
2328         spin_lock(root_lock);
2329         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2330                 kfree_skb(skb);
2331                 rc = NET_XMIT_DROP;
2332         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2333                    qdisc_run_begin(q)) {
2334                 /*
2335                  * This is a work-conserving queue; there are no old skbs
2336                  * waiting to be sent out; and the qdisc is not running -
2337                  * xmit the skb directly.
2338                  */
2339                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2340                         skb_dst_force(skb);
2341
2342                 qdisc_bstats_update(q, skb);
2343
2344                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
                             /*
                              * We are about to run the queue ourselves, so
                              * release busylock first; clearing 'contended'
                              * keeps the unlock at the end from repeating it.
                              */
2345                         if (unlikely(contended)) {
2346                                 spin_unlock(&q->busylock);
2347                                 contended = false;
2348                         }
2349                         __qdisc_run(q);
2350                 } else
2351                         qdisc_run_end(q);
2352
2353                 rc = NET_XMIT_SUCCESS;
2354         } else {
                     /* Regular path: enqueue, then run the qdisc if it is idle. */
2355                 skb_dst_force(skb);
2356                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2357                 if (qdisc_run_begin(q)) {
2358                         if (unlikely(contended)) {
2359                                 spin_unlock(&q->busylock);
2360                                 contended = false;
2361                         }
2362                         __qdisc_run(q);
2363                 }
2364         }
2365         spin_unlock(root_lock);
             /* busylock is still held if no branch above released it. */
2366         if (unlikely(contended))
2367                 spin_unlock(&q->busylock);
2368         return rc;
2369 }
2370
/*
 * Per-CPU count of dev_hard_start_xmit() calls currently in progress
 * from the queue-less path of dev_queue_xmit(); once it exceeds
 * RECURSION_LIMIT, further nested transmits are refused.
 */
2371 static DEFINE_PER_CPU(int, xmit_recursion);
2372 #define RECURSION_LIMIT 10
2373
2374 /**
2375  *      dev_queue_xmit - transmit a buffer
2376  *      @skb: buffer to transmit
2377  *
2378  *      Queue a buffer for transmission to a network device. The caller must
2379  *      have set the device and priority and built the buffer before calling
2380  *      this function. The function can be called from an interrupt.
2381  *
2382  *      A negative errno code is returned on a failure. A success does not
2383  *      guarantee the frame will be transmitted as it may be dropped due
2384  *      to congestion or traffic shaping.
2385  *
2386  * -----------------------------------------------------------------------------------
2387  *      I notice this method can also return errors from the queue disciplines,
2388  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2389  *      be positive.
2390  *
2391  *      Regardless of the return value, the skb is consumed, so it is currently
2392  *      difficult to retry a send to this method.  (You can bump the ref count
2393  *      before sending to hold a reference for retry if you are careful.)
2394  *
2395  *      When calling this method, interrupts MUST be enabled.  This is because
2396  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2397  *          --BLG
2398  */
2399 int dev_queue_xmit(struct sk_buff *skb)
2400 {
2401         struct net_device *dev = skb->dev;
2402         struct netdev_queue *txq;
2403         struct Qdisc *q;
             /*
              * The initial value is never returned: every path below
              * assigns rc before leaving the function.
              */
2404         int rc = -ENOMEM;
2405
2406         /* Disable soft irqs for various locks below. Also
2407          * stops preemption for RCU.
2408          */
2409         rcu_read_lock_bh();
2410
2411         txq = dev_pick_tx(dev, skb);
2412         q = rcu_dereference_bh(txq->qdisc);
2413
2414 #ifdef CONFIG_NET_CLS_ACT
2415         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2416 #endif
2417         trace_net_dev_queue(skb);
             /* The qdisc has an enqueue method: go through the qdisc. */
2418         if (q->enqueue) {
2419                 rc = __dev_xmit_skb(skb, q, dev, txq);
2420                 goto out;
2421         }
2422
2423         /* The device has no queue. Common case for software devices:
2424            loopback, all the sorts of tunnels...
2425
2426            Really, it is unlikely that netif_tx_lock protection is necessary
2427            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2428            counters.)
2429            However, it is possible, that they rely on protection
2430            made by us here.
2431
2432            Check this and shot the lock. It is not prone from deadlocks.
2433            Either shot noqueue qdisc, it is even simpler 8)
2434          */
2435         if (dev->flags & IFF_UP) {
2436                 int cpu = smp_processor_id(); /* ok because BHs are off */
2437
                     /*
                      * xmit_lock_owner equal to this CPU is treated as the
                      * device transmitting to itself (else branch below).
                      */
2438                 if (txq->xmit_lock_owner != cpu) {
2439
2440                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2441                                 goto recursion_alert;
2442
2443                         HARD_TX_LOCK(dev, txq, cpu);
2444
2445                         if (!netif_tx_queue_stopped(txq)) {
2446                                 __this_cpu_inc(xmit_recursion);
2447                                 rc = dev_hard_start_xmit(skb, dev, txq);
2448                                 __this_cpu_dec(xmit_recursion);
2449                                 if (dev_xmit_complete(rc)) {
2450                                         HARD_TX_UNLOCK(dev, txq);
2451                                         goto out;
2452                                 }
2453                         }
                             /*
                              * Queue stopped or transmit not completed:
                              * there is no qdisc to hold the packet, so it
                              * is dropped below.
                              */
2454                         HARD_TX_UNLOCK(dev, txq);
2455                         if (net_ratelimit())
2456                                 printk(KERN_CRIT "Virtual device %s asks to "
2457                                        "queue packet!\n", dev->name);
2458                 } else {
2459                         /* Recursion is detected! It is possible,
2460                          * unfortunately
2461                          */
2462 recursion_alert:
2463                         if (net_ratelimit())
2464                                 printk(KERN_CRIT "Dead loop on virtual device "
2465                                        "%s, fix it urgently!\n", dev->name);
2466                 }
2467         }
2468
             /*
              * Common drop path: device down, queue stopped, transmit not
              * completed, or recursion.  All of them report -ENETDOWN and
              * free the skb.
              */
2469         rc = -ENETDOWN;
2470         rcu_read_unlock_bh();
2471
2472         kfree_skb(skb);
2473         return rc;
2474 out:
2475         rcu_read_unlock_bh();
2476         return rc;
2477 }
2478 EXPORT_SYMBOL(dev_queue_xmit);
2479
2480
2481 /*=======================================================================
2482                         Receiver routines
2483   =======================================================================*/
2484
/*
 * Tunables for the receiver routines below.  NOTE(review): the code that
 * reads them is outside this part of the file -- confirm units and
 * meaning there before changing a default.
 */
2485 int netdev_max_backlog __read_mostly = 1000;
2486 int netdev_tstamp_prequeue __read_mostly = 1;
2487 int netdev_budget __read_mostly = 300;
2488 int weight_p __read_mostly = 64;            /* old backlog weight */
2489
2490 /* Called with irq disabled */
2491 static inline void ____napi_schedule(struct softnet_data *sd,
2492                                      struct napi_struct *napi)
2493 {
2494         list_add_tail(&napi->poll_list, &sd->poll_list);
2495         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2496 }
2497
2498 /*
2499  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2500  * and src/dst port numbers. Returns a non-zero hash number on success
2501  * and 0 on failure.
2502  */
2503 __u32 __skb_get_rxhash(struct sk_buff *skb)
2504 {
2505         int nhoff, hash = 0, poff;
2506         const struct ipv6hdr *ip6;
2507         const struct iphdr *ip;
2508         u8 ip_proto;
2509         u32 addr1, addr2, ihl;
2510         union {
2511                 u32 v32;
2512                 u16 v16[2];
2513         } ports;
2514
2515         nhoff = skb_network_offset(skb);
2516
2517         switch (skb->protocol) {
2518         case __constant_htons(ETH_P_IP):
2519                 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2520                         goto done;
2521
2522                 ip = (const struct iphdr *) (skb->data + nhoff);
2523                 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2524                         ip_proto = 0;
2525                 else
2526                         ip_proto = ip->protocol;
2527                 addr1 = (__force u32) ip->saddr;
2528                 addr2 = (__force u32) ip->daddr;
2529                 ihl = ip->ihl;
2530                 break;
2531         case __constant_htons(ETH_P_IPV6):
2532                 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2533                         goto done;
2534
2535                 ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2536                 ip_proto = ip6->nexthdr;
2537                 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2538                 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2539                 ihl = (40 >> 2);
2540                 break;
2541         default:
2542                 goto done;
2543         }
2544
2545         ports.v32 = 0;
2546         poff = proto_ports_offset(ip_proto);
2547         if (poff >= 0) {
2548                 nhoff += ihl * 4 + poff;
2549                 if (pskb_may_pull(skb, nhoff + 4)) {
2550                         ports.v32 = * (__force u32 *) (skb->data + nhoff);
2551                         if (ports.v16[1] < ports.v16[0])
2552                                 swap(ports.v16[0], ports.v16[1]);
2553                 }
2554         }
2555
2556         /* get a consistent hash (same value on both flow directions) */
2557         if (addr2 < addr1)
2558                 swap(addr1, addr2);
2559
2560         hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2561         if (!hash)
2562                 hash = 1;
2563
2564 done:
2565         return hash;
2566 }
2567 EXPORT_SYMBOL(__skb_get_rxhash);
2568
2569 #ifdef CONFIG_RPS
2570
/*
 * One global table that all flow-based protocols share.
 * The pointer is annotated __rcu; get_rps_cpu() reads it with
 * rcu_dereference() and indexes ents[] with the packet's rxhash.
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
2574
2575 static struct rps_dev_flow *
2576 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2577             struct rps_dev_flow *rflow, u16 next_cpu)
2578 {
2579         u16 tcpu;
2580
2581         tcpu = rflow->cpu = next_cpu;
2582         if (tcpu != RPS_NO_CPU) {
2583 #ifdef CONFIG_RFS_ACCEL
2584                 struct netdev_rx_queue *rxqueue;
2585                 struct rps_dev_flow_table *flow_table;
2586                 struct rps_dev_flow *old_rflow;
2587                 u32 flow_id;
2588                 u16 rxq_index;
2589                 int rc;
2590
2591                 /* Should we steer this flow to a different hardware queue? */
2592                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2593                     !(dev->features & NETIF_F_NTUPLE))
2594                         goto out;
2595                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2596                 if (rxq_index == skb_get_rx_queue(skb))
2597                         goto out;
2598
2599                 rxqueue = dev->_rx + rxq_index;
2600                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2601                 if (!flow_table)
2602                         goto out;
2603                 flow_id = skb->rxhash & flow_table->mask;
2604                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2605                                                         rxq_index, flow_id);
2606                 if (rc < 0)
2607                         goto out;
2608                 old_rflow = rflow;
2609                 rflow = &flow_table->flows[flow_id];
2610                 rflow->cpu = next_cpu;
2611                 rflow->filter = rc;
2612                 if (old_rflow->filter == rflow->filter)
2613                         old_rflow->filter = RPS_NO_FILTER;
2614         out:
2615 #endif
2616                 rflow->last_qtail =
2617                         per_cpu(softnet_data, tcpu).input_queue_head;
2618         }
2619
2620         return rflow;
2621 }
2622
2623 /*
2624  * get_rps_cpu is called from netif_receive_skb and returns the target
2625  * CPU from the RPS map of the receiving queue for a given skb.
2626  * rcu_read_lock must be held on entry.
2627  */
2628 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2629                        struct rps_dev_flow **rflowp)
2630 {
2631         struct netdev_rx_queue *rxqueue;
2632         struct rps_map *map;
2633         struct rps_dev_flow_table *flow_table;
2634         struct rps_sock_flow_table *sock_flow_table;
2635         int cpu = -1;
2636         u16 tcpu;
2637
2638         if (skb_rx_queue_recorded(skb)) {
2639                 u16 index = skb_get_rx_queue(skb);
2640                 if (unlikely(index >= dev->real_num_rx_queues)) {
2641                         WARN_ONCE(dev->real_num_rx_queues > 1,
2642                                   "%s received packet on queue %u, but number "
2643                                   "of RX queues is %u\n",
2644                                   dev->name, index, dev->real_num_rx_queues);
2645                         goto done;
2646                 }
2647                 rxqueue = dev->_rx + index;
2648         } else
2649                 rxqueue = dev->_rx;
2650
2651         map = rcu_dereference(rxqueue->rps_map);
2652         if (map) {
2653                 if (map->len == 1 &&
2654                     !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2655                         tcpu = map->cpus[0];
2656                         if (cpu_online(tcpu))
2657                                 cpu = tcpu;
2658                         goto done;
2659                 }
2660         } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2661                 goto done;
2662         }
2663
2664         skb_reset_network_header(skb);
2665         if (!skb_get_rxhash(skb))
2666                 goto done;
2667
2668         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2669         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2670         if (flow_table && sock_flow_table) {
2671                 u16 next_cpu;
2672                 struct rps_dev_flow *rflow;
2673
2674                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2675                 tcpu = rflow->cpu;
2676
2677                 next_cpu = sock_flow_table->ents[skb->rxhash &
2678                     sock_flow_table->mask];
2679
2680                 /*
2681                  * If the desired CPU (where last recvmsg was done) is
2682                  * different from current CPU (one in the rx-queue flow
2683                  * table entry), switch if one of the following holds:
2684                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2685                  *   - Current CPU is offline.
2686                  *   - The current CPU's queue tail has advanced beyond the
2687                  *     last packet that was enqueued using this table entry.
2688                  *     This guarantees that all previous packets for the flow
2689                  *     have been dequeued, thus preserving in order delivery.
2690                  */
2691                 if (unlikely(tcpu != next_cpu) &&
2692                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2693                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2694                       rflow->last_qtail)) >= 0))
2695                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2696
2697                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2698                         *rflowp = rflow;
2699                         cpu = tcpu;
2700                         goto done;
2701                 }
2702         }
2703
2704         if (map) {
2705                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2706
2707                 if (cpu_online(tcpu)) {
2708                         cpu = tcpu;
2709                         goto done;
2710                 }
2711         }
2712
2713 done:
2714         return cpu;
2715 }
2716
2717 #ifdef CONFIG_RFS_ACCEL
2718
2719 /**
2720  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2721  * @dev: Device on which the filter was set
2722  * @rxq_index: RX queue index
2723  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2724  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2725  *
2726  * Drivers that implement ndo_rx_flow_steer() should periodically call
2727  * this function for each installed filter and remove the filters for
2728  * which it returns %true.
2729  */
2730 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2731                          u32 flow_id, u16 filter_id)
2732 {
2733         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2734         struct rps_dev_flow_table *flow_table;
2735         struct rps_dev_flow *rflow;
2736         bool expire = true;
2737         int cpu;
2738
2739         rcu_read_lock();
2740         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2741         if (flow_table && flow_id <= flow_table->mask) {
2742                 rflow = &flow_table->flows[flow_id];
2743                 cpu = ACCESS_ONCE(rflow->cpu);
2744                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2745                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2746                            rflow->last_qtail) <
2747                      (int)(10 * flow_table->mask)))
2748                         expire = false;
2749         }
2750         rcu_read_unlock();
2751         return expire;
2752 }
2753 EXPORT_SYMBOL(rps_may_expire_flow);
2754
2755 #endif /* CONFIG_RFS_ACCEL */
2756
2757 /* Called from hardirq (IPI) context */
2758 static void rps_trigger_softirq(void *data)
2759 {
2760         struct softnet_data *sd = data;
2761
2762         ____napi_schedule(sd, &sd->backlog);
2763         sd->received_rps++;
2764 }
2765
2766 #endif /* CONFIG_RPS */
2767
/*
 * Check whether @sd belongs to another cpu.
 * If it does, push it onto this cpu's rps_ipi_list, raise
 * NET_RX_SOFTIRQ and return 1.
 * If it is our own (or CONFIG_RPS is off), return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *mysd = &__get_cpu_var(softnet_data);

        if (sd == mysd)
                return 0;

        sd->rps_ipi_next = mysd->rps_ipi_list;
        mysd->rps_ipi_list = sd;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
#else
        return 0;
#endif /* CONFIG_RPS */
}
2788
2789 /*
2790  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2791  * queue (may be a remote CPU queue).
2792  */
2793 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2794                               unsigned int *qtail)
2795 {
2796         struct softnet_data *sd;
2797         unsigned long flags;
2798
2799         sd = &per_cpu(softnet_data, cpu);
2800
2801         local_irq_save(flags);
2802
2803         rps_lock(sd);
2804         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2805                 if (skb_queue_len(&sd->input_pkt_queue)) {
2806 enqueue:
2807                         __skb_queue_tail(&sd->input_pkt_queue, skb);
2808                         input_queue_tail_incr_save(sd, qtail);
2809                         rps_unlock(sd);
2810                         local_irq_restore(flags);
2811                         return NET_RX_SUCCESS;
2812                 }
2813
2814                 /* Schedule NAPI for backlog device
2815                  * We can use non atomic operation since we own the queue lock
2816                  */
2817                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2818                         if (!rps_ipi_queued(sd))
2819                                 ____napi_schedule(sd, &sd->backlog);
2820                 }
2821                 goto enqueue;
2822         }
2823
2824         sd->dropped++;
2825         rps_unlock(sd);
2826
2827         local_irq_restore(flags);
2828
2829         atomic_long_inc(&skb->dev->rx_dropped);
2830         kfree_skb(skb);
2831         return NET_RX_DROP;
2832 }
2833
2834 /**
2835  *      netif_rx        -       post buffer to the network code
2836  *      @skb: buffer to post
2837  *
2838  *      This function receives a packet from a device driver and queues it for
2839  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2840  *      may be dropped during processing for congestion control or by the
2841  *      protocol layers.
2842  *
2843  *      return values:
2844  *      NET_RX_SUCCESS  (no congestion)
2845  *      NET_RX_DROP     (packet was dropped)
2846  *
2847  */
2848
2849 int netif_rx(struct sk_buff *skb)
2850 {
2851         int ret;
2852
2853         /* if netpoll wants it, pretend we never saw it */
2854         if (netpoll_rx(skb))
2855                 return NET_RX_DROP;
2856
2857         if (netdev_tstamp_prequeue)
2858                 net_timestamp_check(skb);
2859
2860         trace_netif_rx(skb);
2861 #ifdef CONFIG_RPS
2862         {
2863                 struct rps_dev_flow voidflow, *rflow = &voidflow;
2864                 int cpu;
2865
2866                 preempt_disable();
2867                 rcu_read_lock();
2868
2869                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
2870                 if (cpu < 0)
2871                         cpu = smp_processor_id();
2872
2873                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2874
2875                 rcu_read_unlock();
2876                 preempt_enable();
2877         }
2878 #else
2879         {
2880                 unsigned int qtail;
2881                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2882                 put_cpu();
2883         }
2884 #endif
2885         return ret;
2886 }
2887 EXPORT_SYMBOL(netif_rx);
2888
/*
 * netif_rx_ni - netif_rx() followed by immediate softirq processing
 * @skb: buffer to post
 *
 * Queues @skb with netif_rx() while preemption is disabled and, if any
 * softirq is pending afterwards, runs do_softirq() before re-enabling
 * preemption. Returns the netif_rx() result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int ret;

        preempt_disable();

        ret = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();
        return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
2902
2903 static void net_tx_action(struct softirq_action *h)
2904 {
2905         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2906
2907         if (sd->completion_queue) {
2908                 struct sk_buff *clist;
2909
2910                 local_irq_disable();
2911                 clist = sd->completion_queue;
2912                 sd->completion_queue = NULL;
2913                 local_irq_enable();
2914
2915                 while (clist) {
2916                         struct sk_buff *skb = clist;
2917                         clist = clist->next;
2918
2919                         WARN_ON(atomic_read(&skb->users));
2920                         trace_kfree_skb(skb, net_tx_action);
2921                         __kfree_skb(skb);
2922                 }
2923         }
2924
2925         if (sd->output_queue) {
2926                 struct Qdisc *head;
2927
2928                 local_irq_disable();
2929                 head = sd->output_queue;
2930                 sd->output_queue = NULL;
2931                 sd->output_queue_tailp = &sd->output_queue;
2932                 local_irq_enable();
2933
2934                 while (head) {
2935                         struct Qdisc *q = head;
2936                         spinlock_t *root_lock;
2937
2938                         head = head->next_sched;
2939
2940                         root_lock = qdisc_lock(q);
2941                         if (spin_trylock(root_lock)) {
2942                                 smp_mb__before_clear_bit();
2943                                 clear_bit(__QDISC_STATE_SCHED,
2944                                           &q->state);
2945                                 qdisc_run(q);
2946                                 spin_unlock(root_lock);
2947                         } else {
2948                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2949                                               &q->state)) {
2950                                         __netif_reschedule(q);
2951                                 } else {
2952                                         smp_mb__before_clear_bit();
2953                                         clear_bit(__QDISC_STATE_SCHED,
2954                                                   &q->state);
2955                                 }
2956                         }
2957                 }
2958         }
2959 }
2960
2961 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2962     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * This hook is defined here for ATM LANE.
 * It is a function pointer taking a device and an address; it has no
 * initializer, so it is NULL until something assigns it.
 * NOTE(review): presumably set by the bridge code and called by ATM
 * LANE - neither side is visible in this section, confirm.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2967 #endif
2968
2969 #ifdef CONFIG_NET_CLS_ACT
2970 /* TODO: Maybe we should just force sch_ingress to be compiled in
2971  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2972  * a compare and 2 stores extra right now if we dont have it on
2973  * but have CONFIG_NET_CLS_ACT
2974  * NOTE: This doesn't stop any functionality; if you dont have
2975  * the ingress scheduler, you just can't add policies on ingress.
2976  *
2977  */
2978 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2979 {
2980         struct net_device *dev = skb->dev;
2981         u32 ttl = G_TC_RTTL(skb->tc_verd);
2982         int result = TC_ACT_OK;
2983         struct Qdisc *q;
2984
2985         if (unlikely(MAX_RED_LOOP < ttl++)) {
2986                 if (net_ratelimit())
2987                         pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2988                                skb->skb_iif, dev->ifindex);
2989                 return TC_ACT_SHOT;
2990         }
2991
2992         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2993         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2994
2995         q = rxq->qdisc;
2996         if (q != &noop_qdisc) {
2997                 spin_lock(qdisc_lock(q));
2998                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2999                         result = qdisc_enqueue_root(skb, q);
3000                 spin_unlock(qdisc_lock(q));
3001         }
3002
3003         return result;
3004 }
3005
3006 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3007                                          struct packet_type **pt_prev,
3008                                          int *ret, struct net_device *orig_dev)
3009 {
3010         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3011
3012         if (!rxq || rxq->qdisc == &noop_qdisc)
3013                 goto out;
3014
3015         if (*pt_prev) {
3016                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3017                 *pt_prev = NULL;
3018         }
3019
3020         switch (ing_filter(skb, rxq)) {
3021         case TC_ACT_SHOT:
3022         case TC_ACT_STOLEN:
3023                 kfree_skb(skb);
3024                 return NULL;
3025         }
3026
3027 out:
3028         skb->tc_verd = 0;
3029         return skb;
3030 }
3031 #endif
3032
3033 /**
3034  *      netdev_rx_handler_register - register receive handler
3035  *      @dev: device to register a handler for
3036  *      @rx_handler: receive handler to register
3037  *      @rx_handler_data: data pointer that is used by rx handler
3038  *
3039  *      Register a receive hander for a device. This handler will then be
3040  *      called from __netif_receive_skb. A negative errno code is returned
3041  *      on a failure.
3042  *
3043  *      The caller must hold the rtnl_mutex.
3044  *
3045  *      For a general description of rx_handler, see enum rx_handler_result.
3046  */
3047 int netdev_rx_handler_register(struct net_device *dev,
3048                                rx_handler_func_t *rx_handler,
3049                                void *rx_handler_data)
3050 {
3051         ASSERT_RTNL();
3052
3053         if (dev->rx_handler)
3054                 return -EBUSY;
3055
3056         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3057         rcu_assign_pointer(dev->rx_handler, rx_handler);
3058
3059         return 0;
3060 }
3061 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3062
3063 /**
3064  *      netdev_rx_handler_unregister - unregister receive handler
3065  *      @dev: device to unregister a handler from
3066  *
3067  *      Unregister a receive hander from a device.
3068  *
3069  *      The caller must hold the rtnl_mutex.
3070  */
3071 void netdev_rx_handler_unregister(struct net_device *dev)
3072 {
3073
3074         ASSERT_RTNL();
3075         rcu_assign_pointer(dev->rx_handler, NULL);
3076         rcu_assign_pointer(dev->rx_handler_data, NULL);
3077 }
3078 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3079
3080 static void vlan_on_bond_hook(struct sk_buff *skb)
3081 {
3082         /*
3083          * Make sure ARP frames received on VLAN interfaces stacked on
3084          * bonding interfaces still make their way to any base bonding
3085          * device that may have registered for a specific ptype.
3086          */
3087         if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
3088             vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
3089             skb->protocol == htons(ETH_P_ARP)) {
3090                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
3091
3092                 if (!skb2)
3093                         return;
3094                 skb2->dev = vlan_dev_real_dev(skb->dev);
3095                 netif_rx(skb2);
3096         }
3097 }
3098
/*
 * __netif_receive_skb - core of the receive path for one packet
 * @skb: packet to process
 *
 * In order: optional timestamping, netpoll check, VLAN untagging,
 * delivery to the ptype_all handlers, ingress filtering
 * (CONFIG_NET_CLS_ACT), the device's rx_handler, VLAN receive
 * processing, and finally delivery to the ptype_base handlers matching
 * skb->protocol.
 *
 * Throughout the function a matching handler is not called right away
 * but remembered in pt_prev; it is handed to deliver_skb() when the next
 * match (or the next processing stage) comes up, and the very last one
 * is invoked directly through ->func.
 *
 * Returns the result of the last handler that ran, or NET_RX_DROP when
 * netpoll took the packet, it was dropped on the way, or nobody
 * wanted it.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct net_device *orig_dev;
        struct net_device *null_or_dev;
        bool deliver_exact = false;
        int ret = NET_RX_DROP;
        __be16 type;

        /* Timestamp here only if it was not done before queueing */
        if (!netdev_tstamp_prequeue)
                net_timestamp_check(skb);

        trace_netif_receive_skb(skb);

        /* if we've gotten here through NAPI, check netpoll */
        if (netpoll_receive_skb(skb))
                return NET_RX_DROP;

        /* Record the incoming interface once; later rounds keep it */
        if (!skb->skb_iif)
                skb->skb_iif = skb->dev->ifindex;
        orig_dev = skb->dev;

        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb->mac_len = skb->network_header - skb->mac_header;

        pt_prev = NULL;

        rcu_read_lock();

        /* Re-entered when the rx_handler returns RX_HANDLER_ANOTHER */
another_round:

        __this_cpu_inc(softnet_data.processed);

        if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
                skb = vlan_untag(skb);
                if (unlikely(!skb))
                        goto out;
        }

#ifdef CONFIG_NET_CLS_ACT
        /* TC_NCLS set: clear it and skip the taps and ingress filtering */
        if (skb->tc_verd & TC_NCLS) {
                skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
                goto ncls;
        }
#endif

        /* Handlers on ptype_all: unbound ones and those bound to skb->dev */
        list_for_each_entry_rcu(ptype, &ptype_all, list) {
                if (!ptype->dev || ptype->dev == skb->dev) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

#ifdef CONFIG_NET_CLS_ACT
        /* A NULL return means handle_ing() already freed the packet */
        skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
        if (!skb)
                goto out;
ncls:
#endif

        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (rx_handler) {
                /* Flush the pending handler before the rx_handler runs */
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                /* The handler gets &skb and may therefore replace the skb */
                switch (rx_handler(&skb)) {
                case RX_HANDLER_CONSUMED:
                        goto out;
                case RX_HANDLER_ANOTHER:
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
                        /* fall through */
                case RX_HANDLER_PASS:
                        break;
                default:
                        BUG();
                }
        }

        if (vlan_tx_tag_present(skb)) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                /*
                 * Non-zero from vlan_do_receive(): run the whole receive
                 * path again on the (possibly replaced) skb. Zero with a
                 * NULL skb: nothing left to process.
                 */
                if (vlan_do_receive(&skb)) {
                        ret = __netif_receive_skb(skb);
                        goto out;
                } else if (unlikely(!skb))
                        goto out;
        }

        vlan_on_bond_hook(skb);

        /* deliver only exact match when indicated */
        null_or_dev = deliver_exact ? skb->dev : NULL;

        /* Protocol handlers hashed by type; dev may be wildcard, current or original */
        type = skb->protocol;
        list_for_each_entry_rcu(ptype,
                        &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
                if (ptype->type == type &&
                    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
                     ptype->dev == orig_dev)) {
                        if (pt_prev)
                                ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = ptype;
                }
        }

        if (pt_prev) {
                /* Last handler is called directly rather than via deliver_skb() */
                ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
        } else {
                /* Nobody wanted the packet: account for it and free it */
                atomic_long_inc(&skb->dev->rx_dropped);
                kfree_skb(skb);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        rcu_read_unlock();
        return ret;
}
3226
3227 /**
3228  *      netif_receive_skb - process receive buffer from network
3229  *      @skb: buffer to process
3230  *
3231  *      netif_receive_skb() is the main receive data processing function.
3232  *      It always succeeds. The buffer may be dropped during processing
3233  *      for congestion control or by the protocol layers.
3234  *
3235  *      This function may only be called from softirq context and interrupts
3236  *      should be enabled.
3237  *
3238  *      Return values (usually ignored):
3239  *      NET_RX_SUCCESS: no congestion
3240  *      NET_RX_DROP: packet was dropped
3241  */
3242 int netif_receive_skb(struct sk_buff *skb)
3243 {
3244         if (netdev_tstamp_prequeue)
3245                 net_timestamp_check(skb);
3246
3247         if (skb_defer_rx_timestamp(skb))
3248                 return NET_RX_SUCCESS;
3249
3250 #ifdef CONFIG_RPS
3251         {
3252                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3253                 int cpu, ret;
3254
3255                 rcu_read_lock();
3256
3257                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3258
3259                 if (cpu >= 0) {
3260                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3261                         rcu_read_unlock();
3262                 } else {
3263                         rcu_read_unlock();
3264                         ret = __netif_receive_skb(skb);
3265                 }
3266
3267                 return ret;
3268         }
3269 #else
3270         return __netif_receive_skb(skb);
3271 #endif
3272 }
3273 EXPORT_SYMBOL(netif_receive_skb);
3274
3275 /* Network device is going away, flush any packets still pending
3276  * Called with irqs disabled.
3277  */
3278 static void flush_backlog(void *arg)
3279 {
3280         struct net_device *dev = arg;
3281         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3282         struct sk_buff *skb, *tmp;
3283
3284         rps_lock(sd);
3285         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3286                 if (skb->dev == dev) {
3287                         __skb_unlink(skb, &sd->input_pkt_queue);
3288                         kfree_skb(skb);
3289                         input_queue_head_incr(sd);
3290                 }
3291         }
3292         rps_unlock(sd);
3293
3294         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3295                 if (skb->dev == dev) {
3296                         __skb_unlink(skb, &sd->process_queue);
3297                         kfree_skb(skb);
3298                         input_queue_head_incr(sd);
3299                 }
3300         }
3301 }
3302
3303 static int napi_gro_complete(struct sk_buff *skb)
3304 {
3305         struct packet_type *ptype;
3306         __be16 type = skb->protocol;
3307         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3308         int err = -ENOENT;
3309
3310         if (NAPI_GRO_CB(skb)->count == 1) {
3311                 skb_shinfo(skb)->gso_size = 0;
3312                 goto out;
3313         }
3314
3315         rcu_read_lock();
3316         list_for_each_entry_rcu(ptype, head, list) {
3317                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3318                         continue;
3319
3320                 err = ptype->gro_complete(skb);
3321                 break;
3322         }
3323         rcu_read_unlock();
3324
3325         if (err) {
3326                 WARN_ON(&ptype->list == head);
3327                 kfree_skb(skb);
3328                 return NET_RX_SUCCESS;
3329         }
3330
3331 out:
3332         return netif_receive_skb(skb);
3333 }
3334
3335 inline void napi_gro_flush(struct napi_struct *napi)
3336 {
3337         struct sk_buff *skb, *next;
3338
3339         for (skb = napi->gro_list; skb; skb = next) {
3340                 next = skb->next;
3341                 skb->next = NULL;
3342                 napi_gro_complete(skb);
3343         }
3344
3345         napi->gro_count = 0;
3346         napi->gro_list = NULL;
3347 }
3348 EXPORT_SYMBOL(napi_gro_flush);
3349
/*
 * dev_gro_receive - try to merge @skb into a packet held on @napi's GRO list
 * @napi: NAPI context owning the gro_list
 * @skb: newly received packet
 *
 * Returns:
 *   GRO_NORMAL      - GRO not applicable (or list full / flush requested)
 *   GRO_MERGED      - the protocol handler reported the same flow
 *   GRO_MERGED_FREE - as above, with the handler's "free" flag set
 *   GRO_HELD        - @skb was put on the gro_list as a new entry
 *
 * Except on the same-flow path, any header bytes that the GRO offset has
 * moved past but that still sit in the first page fragment are copied
 * into the linear area before returning.
 */
enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
        struct sk_buff **pp = NULL;
        struct packet_type *ptype;
        __be16 type = skb->protocol;
        struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
        int same_flow;
        int mac_len;
        enum gro_result ret;

        /* GRO disabled on the device, or netpoll active for this skb */
        if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
                goto normal;

        /* GSO packets and packets with a frag list are not merged */
        if (skb_is_gso(skb) || skb_has_frag_list(skb))
                goto normal;

        rcu_read_lock();
        /* First handler for this protocol with gro_receive and no device binding */
        list_for_each_entry_rcu(ptype, head, list) {
                if (ptype->type != type || ptype->dev || !ptype->gro_receive)
                        continue;

                skb_set_network_header(skb, skb_gro_offset(skb));
                mac_len = skb->network_header - skb->mac_header;
                skb->mac_len = mac_len;
                /* Reset the per-packet GRO state the handler reports through */
                NAPI_GRO_CB(skb)->same_flow = 0;
                NAPI_GRO_CB(skb)->flush = 0;
                NAPI_GRO_CB(skb)->free = 0;

                pp = ptype->gro_receive(&napi->gro_list, skb);
                break;
        }
        rcu_read_unlock();

        /* Loop ran off the end of the list: no handler for this protocol */
        if (&ptype->list == head)
                goto normal;

        same_flow = NAPI_GRO_CB(skb)->same_flow;
        ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

        /* Handler returned a list slot: unlink that packet and complete it */
        if (pp) {
                struct sk_buff *nskb = *pp;

                *pp = nskb->next;
                nskb->next = NULL;
                napi_gro_complete(nskb);
                napi->gro_count--;
        }

        if (same_flow)
                goto ok;

        if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
                goto normal;

        /* Start a new held entry at the head of the gro_list */
        napi->gro_count++;
        NAPI_GRO_CB(skb)->count = 1;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
        skb->next = napi->gro_list;
        napi->gro_list = skb;
        ret = GRO_HELD;

pull:
        /* Linear area is shorter than the GRO offset: pull the rest from frag0 */
        if (skb_headlen(skb) < skb_gro_offset(skb)) {
                int grow = skb_gro_offset(skb) - skb_headlen(skb);

                BUG_ON(skb->end - skb->tail < grow);

                memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

                skb->tail += grow;
                skb->data_len -= grow;

                skb_shinfo(skb)->frags[0].page_offset += grow;
                skb_shinfo(skb)->frags[0].size -= grow;

                /* First fragment fully consumed: release it and shift the rest down */
                if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
                        put_page(skb_shinfo(skb)->frags[0].page);
                        memmove(skb_shinfo(skb)->frags,
                                skb_shinfo(skb)->frags + 1,
                                --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
                }
        }

ok:
        return ret;

normal:
        ret = GRO_NORMAL;
        goto pull;
}
EXPORT_SYMBOL(dev_gro_receive);
3441
3442 static inline gro_result_t
3443 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3444 {
3445         struct sk_buff *p;
3446
3447         for (p = napi->gro_list; p; p = p->next) {
3448                 unsigned long diffs;
3449
3450                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3451                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3452                 diffs |= compare_ether_header(skb_mac_header(p),
3453                                               skb_gro_mac_header(skb));
3454                 NAPI_GRO_CB(p)->same_flow = !diffs;
3455                 NAPI_GRO_CB(p)->flush = 0;
3456         }
3457
3458         return dev_gro_receive(napi, skb);
3459 }
3460
3461 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3462 {
3463         switch (ret) {
3464         case GRO_NORMAL:
3465                 if (netif_receive_skb(skb))
3466                         ret = GRO_DROP;
3467                 break;
3468
3469         case GRO_DROP:
3470         case GRO_MERGED_FREE:
3471                 kfree_skb(skb);
3472                 break;
3473
3474         case GRO_HELD:
3475         case GRO_MERGED:
3476                 break;
3477         }
3478
3479         return ret;
3480 }
3481 EXPORT_SYMBOL(napi_skb_finish);
3482
3483 void skb_gro_reset_offset(struct sk_buff *skb)
3484 {
3485         NAPI_GRO_CB(skb)->data_offset = 0;
3486         NAPI_GRO_CB(skb)->frag0 = NULL;
3487         NAPI_GRO_CB(skb)->frag0_len = 0;
3488
3489         if (skb->mac_header == skb->tail &&
3490             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3491                 NAPI_GRO_CB(skb)->frag0 =
3492                         page_address(skb_shinfo(skb)->frags[0].page) +
3493                         skb_shinfo(skb)->frags[0].page_offset;
3494                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3495         }
3496 }
3497 EXPORT_SYMBOL(skb_gro_reset_offset);
3498
3499 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3500 {
3501         skb_gro_reset_offset(skb);
3502
3503         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3504 }
3505 EXPORT_SYMBOL(napi_gro_receive);
3506
3507 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3508 {
3509         __skb_pull(skb, skb_headlen(skb));
3510         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3511         skb->vlan_tci = 0;
3512         skb->dev = napi->dev;
3513         skb->skb_iif = 0;
3514
3515         napi->skb = skb;
3516 }
3517
3518 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3519 {
3520         struct sk_buff *skb = napi->skb;
3521
3522         if (!skb) {
3523                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3524                 if (skb)
3525                         napi->skb = skb;
3526         }
3527         return skb;
3528 }
3529 EXPORT_SYMBOL(napi_get_frags);
3530
3531 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3532                                gro_result_t ret)
3533 {
3534         switch (ret) {
3535         case GRO_NORMAL:
3536         case GRO_HELD:
3537                 skb->protocol = eth_type_trans(skb, skb->dev);
3538
3539                 if (ret == GRO_HELD)
3540                         skb_gro_pull(skb, -ETH_HLEN);
3541                 else if (netif_receive_skb(skb))
3542                         ret = GRO_DROP;
3543                 break;
3544
3545         case GRO_DROP:
3546         case GRO_MERGED_FREE:
3547                 napi_reuse_skb(napi, skb);
3548                 break;
3549
3550         case GRO_MERGED:
3551                 break;
3552         }
3553
3554         return ret;
3555 }
3556 EXPORT_SYMBOL(napi_frags_finish);
3557
3558 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3559 {
3560         struct sk_buff *skb = napi->skb;
3561         struct ethhdr *eth;
3562         unsigned int hlen;
3563         unsigned int off;
3564
3565         napi->skb = NULL;
3566
3567         skb_reset_mac_header(skb);
3568         skb_gro_reset_offset(skb);
3569
3570         off = skb_gro_offset(skb);
3571         hlen = off + sizeof(*eth);
3572         eth = skb_gro_header_fast(skb, off);
3573         if (skb_gro_header_hard(skb, hlen)) {
3574                 eth = skb_gro_header_slow(skb, hlen, off);
3575                 if (unlikely(!eth)) {
3576                         napi_reuse_skb(napi, skb);
3577                         skb = NULL;
3578                         goto out;
3579                 }
3580         }
3581
3582         skb_gro_pull(skb, sizeof(*eth));
3583
3584         /*
3585          * This works because the only protocols we care about don't require
3586          * special handling.  We'll fix it up properly at the end.
3587          */
3588         skb->protocol = eth->h_proto;
3589
3590 out:
3591         return skb;
3592 }
3593 EXPORT_SYMBOL(napi_frags_skb);
3594
3595 gro_result_t napi_gro_frags(struct napi_struct *napi)
3596 {
3597         struct sk_buff *skb = napi_frags_skb(napi);
3598
3599         if (!skb)
3600                 return GRO_DROP;
3601
3602         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3603 }
3604 EXPORT_SYMBOL(napi_gro_frags);
3605
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but exits with local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (!remsd) {
		local_irq_enable();
		return;
	}

	/* Detach the whole list before interrupts come back on. */
	sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Kick RPS processing on every remote cpu that is still online. */
	do {
		struct softnet_data *next = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			__smp_call_function_single(remsd->cpu,
						   &remsd->csd, 0);
		remsd = next;
	} while (remsd);
#else
	local_irq_enable();
#endif
}
3633
/*
 * process_backlog - poll handler for the backlog napi_struct of a softnet_data
 * @napi: the "backlog" member of a struct softnet_data
 * @quota: maximum number of packets to deliver in this call
 *
 * Delivers packets from sd->process_queue through __netif_receive_skb() and
 * refills that queue from sd->input_pkt_queue under rps_lock().  Returns the
 * number of packets delivered.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd->rps_ipi_list) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}
#endif
	napi->weight = weight_p;
	local_irq_disable();
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;

		/* Interrupts are enabled only around the actual delivery. */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();
			__netif_receive_skb(skb);
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		rps_lock(sd);
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);

		if (qlen < quota - work) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
			 * we can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			list_del(&napi->poll_list);
			napi->state = 0;

			/*
			 * Shrink the quota so the outer loop ends once the
			 * qlen packets just spliced have been delivered.
			 */
			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}
3690
3691 /**
3692  * __napi_schedule - schedule for receive
3693  * @n: entry to schedule
3694  *
3695  * The entry's receive function will be scheduled to run
3696  */
3697 void __napi_schedule(struct napi_struct *n)
3698 {
3699         unsigned long flags;
3700
3701         local_irq_save(flags);
3702         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3703         local_irq_restore(flags);
3704 }
3705 EXPORT_SYMBOL(__napi_schedule);
3706
/*
 * __napi_complete - take @n off its poll list and clear NAPI_STATE_SCHED
 * @n: NAPI context that finished polling
 *
 * It is a bug to call this when @n is not scheduled or still has packets on
 * its gro_list.  No interrupt masking is done here; napi_complete() wraps
 * this call in local_irq_save()/local_irq_restore().
 */
void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
	BUG_ON(n->gro_list);

	list_del(&n->poll_list);
	/* Order the list removal before the SCHED bit is seen as clear. */
	smp_mb__before_clear_bit();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
3717
3718 void napi_complete(struct napi_struct *n)
3719 {
3720         unsigned long flags;
3721
3722         /*
3723          * don't let napi dequeue from the cpu poll list
3724          * just in case its running on a different cpu
3725          */
3726         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3727                 return;
3728
3729         napi_gro_flush(n);
3730         local_irq_save(flags);
3731         __napi_complete(n);
3732         local_irq_restore(flags);
3733 }
3734 EXPORT_SYMBOL(napi_complete);
3735
3736 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3737                     int (*poll)(struct napi_struct *, int), int weight)
3738 {
3739         INIT_LIST_HEAD(&napi->poll_list);
3740         napi->gro_count = 0;
3741         napi->gro_list = NULL;
3742         napi->skb = NULL;
3743         napi->poll = poll;
3744         napi->weight = weight;
3745         list_add(&napi->dev_list, &dev->napi_list);
3746         napi->dev = dev;
3747 #ifdef CONFIG_NETPOLL
3748         spin_lock_init(&napi->poll_lock);
3749         napi->poll_owner = -1;
3750 #endif
3751         set_bit(NAPI_STATE_SCHED, &napi->state);
3752 }
3753 EXPORT_SYMBOL(netif_napi_add);
3754
3755 void netif_napi_del(struct napi_struct *napi)
3756 {
3757         struct sk_buff *skb, *next;
3758
3759         list_del_init(&napi->dev_list);
3760         napi_free_frags(napi);
3761
3762         for (skb = napi->gro_list; skb; skb = next) {
3763                 next = skb->next;
3764                 skb->next = NULL;
3765                 kfree_skb(skb);
3766         }
3767
3768         napi->gro_list = NULL;
3769         napi->gro_count = 0;
3770 }
3771 EXPORT_SYMBOL(netif_napi_del);
3772
/*
 * net_rx_action - poll the NAPI contexts queued on this cpu's softnet_data
 * @h: softirq action descriptor (not used in the body)
 *
 * Repeatedly polls the first entry of sd->poll_list with its own weight and
 * charges the work done against netdev_budget.  When the budget is used up
 * or more than two jiffies have elapsed, time_squeeze is bumped and
 * NET_RX_SOFTIRQ is raised again so the remaining entries are handled later.
 * Pending RPS IPIs are sent on the way out.
 * NOTE(review): presumably installed as the NET_RX_SOFTIRQ handler; the
 * registration is not visible in this part of the file.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		/* A poll routine must never report more work than its weight. */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else
				list_move_tail(&n->poll_list, &sd->poll_list);
		}

		netpoll_poll_unlock(have);
	}
out:
	/* Re-enables local interrupts as a side effect. */
	net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	/* Out of budget or time: record the squeeze and reschedule. */
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
3858
/* Per-address-family SIOCGIFCONF handlers, indexed by family number. */
static gifconf_func_t *gifconf_list[NPROTO];
3860
3861 /**
3862  *      register_gifconf        -       register a SIOCGIF handler
3863  *      @family: Address family
3864  *      @gifconf: Function handler
3865  *
3866  *      Register protocol dependent address dumping routines. The handler
3867  *      that is passed must not be freed or reused until it has been replaced
3868  *      by another handler.
3869  */
3870 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3871 {
3872         if (family >= NPROTO)
3873                 return -EINVAL;
3874         gifconf_list[family] = gifconf;
3875         return 0;
3876 }
3877 EXPORT_SYMBOL(register_gifconf);
3878
3879
3880 /*
3881  *      Map an interface index to its name (SIOCGIFNAME)
3882  */
3883
3884 /*
3885  *      We need this ioctl for efficient implementation of the
3886  *      if_indextoname() function required by the IPv6 API.  Without
3887  *      it, we would have to search all the interfaces to find a
3888  *      match.  --pb
3889  */
3890
3891 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3892 {
3893         struct net_device *dev;
3894         struct ifreq ifr;
3895
3896         /*
3897          *      Fetch the caller's info block.
3898          */
3899
3900         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3901                 return -EFAULT;
3902
3903         rcu_read_lock();
3904         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3905         if (!dev) {
3906                 rcu_read_unlock();
3907                 return -ENODEV;
3908         }
3909
3910         strcpy(ifr.ifr_name, dev->name);
3911         rcu_read_unlock();
3912
3913         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3914                 return -EFAULT;
3915         return 0;
3916 }
3917
3918 /*
3919  *      Perform a SIOCGIFCONF call. This structure will change
3920  *      size eventually, and there is nothing I can do about it.
3921  *      Thus we will need a 'compatibility mode'.
3922  */
3923
3924 static int dev_ifconf(struct net *net, char __user *arg)
3925 {
3926         struct ifconf ifc;
3927         struct net_device *dev;
3928         char __user *pos;
3929         int len;
3930         int total;
3931         int i;
3932
3933         /*
3934          *      Fetch the caller's info block.
3935          */
3936
3937         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3938                 return -EFAULT;
3939
3940         pos = ifc.ifc_buf;
3941         len = ifc.ifc_len;
3942
3943         /*
3944          *      Loop over the interfaces, and write an info block for each.
3945          */
3946
3947         total = 0;
3948         for_each_netdev(net, dev) {
3949                 for (i = 0; i < NPROTO; i++) {
3950                         if (gifconf_list[i]) {
3951                                 int done;
3952                                 if (!pos)
3953                                         done = gifconf_list[i](dev, NULL, 0);
3954                                 else
3955                                         done = gifconf_list[i](dev, pos + total,
3956                                                                len - total);
3957                                 if (done < 0)
3958                                         return -EFAULT;
3959                                 total += done;
3960                         }
3961                 }
3962         }
3963
3964         /*
3965          *      All done.  Write the updated control block back to the caller.
3966          */
3967         ifc.ifc_len = total;
3968
3969         /*
3970          *      Both BSD and Solaris return 0 here, so we do too.
3971          */
3972         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3973 }
3974
3975 #ifdef CONFIG_PROC_FS
3976 /*
3977  *      This is invoked by the /proc filesystem handler to display a device
3978  *      in detail.
3979  */
3980 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3981         __acquires(RCU)
3982 {
3983         struct net *net = seq_file_net(seq);
3984         loff_t off;
3985         struct net_device *dev;
3986
3987         rcu_read_lock();
3988         if (!*pos)
3989                 return SEQ_START_TOKEN;
3990
3991         off = 1;
3992         for_each_netdev_rcu(net, dev)
3993                 if (off++ == *pos)
3994                         return dev;
3995
3996         return NULL;
3997 }
3998
3999 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4000 {
4001         struct net_device *dev = v;
4002
4003         if (v == SEQ_START_TOKEN)
4004                 dev = first_net_device_rcu(seq_file_net(seq));
4005         else
4006                 dev = next_net_device_rcu(dev);
4007
4008         ++*pos;
4009         return dev;
4010 }
4011
/* seq_file ->stop: leave the RCU read section entered by dev_seq_start(). */
void dev_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
4017
4018 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4019 {
4020         struct rtnl_link_stats64 temp;
4021         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4022
4023         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4024                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4025                    dev->name, stats->rx_bytes, stats->rx_packets,
4026                    stats->rx_errors,
4027                    stats->rx_dropped + stats->rx_missed_errors,
4028                    stats->rx_fifo_errors,
4029                    stats->rx_length_errors + stats->rx_over_errors +
4030                     stats->rx_crc_errors + stats->rx_frame_errors,
4031                    stats->rx_compressed, stats->multicast,
4032                    stats->tx_bytes, stats->tx_packets,
4033                    stats->tx_errors, stats->tx_dropped,
4034                    stats->tx_fifo_errors, stats->collisions,
4035                    stats->tx_carrier_errors +
4036                     stats->tx_aborted_errors +
4037                     stats->tx_window_errors +
4038                     stats->tx_heartbeat_errors,
4039                    stats->tx_compressed);
4040 }
4041
4042 /*
4043  *      Called from the PROCfs module. This now uses the new arbitrary sized
4044  *      /proc/net interface to create /proc/net/dev
4045  */
4046 static int dev_seq_show(struct seq_file *seq, void *v)
4047 {
4048         if (v == SEQ_START_TOKEN)
4049                 seq_puts(seq, "Inter-|   Receive                            "
4050                               "                    |  Transmit\n"
4051                               " face |bytes    packets errs drop fifo frame "
4052                               "compressed multicast|bytes    packets errs "
4053                               "drop fifo colls carrier compressed\n");
4054         else
4055                 dev_seq_printf_stats(seq, v);
4056         return 0;
4057 }
4058
4059 static struct softnet_data *softnet_get_online(loff_t *pos)
4060 {
4061         struct softnet_data *sd = NULL;
4062
4063         while (*pos < nr_cpu_ids)
4064                 if (cpu_online(*pos)) {
4065                         sd = &per_cpu(softnet_data, *pos);
4066                         break;
4067                 } else
4068                         ++*pos;
4069         return sd;
4070 }
4071
/* seq_file ->start: softnet_data of the first online cpu at or after *pos. */
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
	return softnet_get_online(pos);
}
4076
/* seq_file ->next: step past the current cpu and find the next online one. */
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return softnet_get_online(pos);
}
4082
/* seq_file ->stop: nothing to release, ->start takes no lock. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4086
/*
 * Print one line of ten zero-padded hex columns for the softnet_data in @v:
 * processed, dropped, time_squeeze, five columns that are always zero,
 * cpu_collision and received_rps.
 */
static int softnet_seq_show(struct seq_file *seq, void *v)
{
	struct softnet_data *sd = v;

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
		   sd->processed, sd->dropped, sd->time_squeeze, 0,
		   0, 0, 0, 0, /* was fastroute */
		   sd->cpu_collision, sd->received_rps);
	return 0;
}
4097
/* seq_file iterator over the network devices of a namespace. */
static const struct seq_operations dev_seq_ops = {
	.start = dev_seq_start,
	.next  = dev_seq_next,
	.stop  = dev_seq_stop,
	.show  = dev_seq_show,
};
4104
/* ->open: set up a namespace-aware seq_file that iterates with dev_seq_ops. */
static int dev_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &dev_seq_ops,
			    sizeof(struct seq_net_private));
}
4110
/* File operations of the per-namespace "dev" proc entry. */
static const struct file_operations dev_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = dev_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
4118
/* seq_file iterator over the softnet_data of every online cpu. */
static const struct seq_operations softnet_seq_ops = {
	.start = softnet_seq_start,
	.next  = softnet_seq_next,
	.stop  = softnet_seq_stop,
	.show  = softnet_seq_show,
};
4125
/* ->open: plain seq_file (no per-namespace state) using softnet_seq_ops. */
static int softnet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &softnet_seq_ops);
}
4130
/* File operations of the "softnet_stat" proc entry. */
static const struct file_operations softnet_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = softnet_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};
4138
4139 static void *ptype_get_idx(loff_t pos)
4140 {
4141         struct packet_type *pt = NULL;
4142         loff_t i = 0;
4143         int t;
4144
4145         list_for_each_entry_rcu(pt, &ptype_all, list) {
4146                 if (i == pos)
4147                         return pt;
4148                 ++i;
4149         }
4150
4151         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4152                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4153                         if (i == pos)
4154                                 return pt;
4155                         ++i;
4156                 }
4157         }
4158         return NULL;
4159 }
4160
4161 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4162         __acquires(RCU)
4163 {
4164         rcu_read_lock();
4165         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4166 }
4167
/*
 * seq_file ->next for the packet type listing.  Advances *pos and returns
 * the entry following @v: first the rest of ptype_all, then the ptype_base
 * buckets in increasing index order.  Returns NULL after the last bucket.
 */
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct packet_type *pt;
	struct list_head *nxt;
	int hash;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ptype_get_idx(0);

	pt = v;
	nxt = pt->list.next;
	if (pt->type == htons(ETH_P_ALL)) {
		/* Still inside ptype_all unless we wrapped to its head. */
		if (nxt != &ptype_all)
			goto found;
		hash = 0;
		nxt = ptype_base[0].next;
	} else
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;

	/* nxt pointing at a bucket head means that bucket is exhausted. */
	while (nxt == &ptype_base[hash]) {
		if (++hash >= PTYPE_HASH_SIZE)
			return NULL;
		nxt = ptype_base[hash].next;
	}
found:
	return list_entry(nxt, struct packet_type, list);
}
4196
/* seq_file ->stop: leave the RCU read section entered by ptype_seq_start(). */
static void ptype_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
4202
4203 static int ptype_seq_show(struct seq_file *seq, void *v)
4204 {
4205         struct packet_type *pt = v;
4206
4207         if (v == SEQ_START_TOKEN)
4208                 seq_puts(seq, "Type Device      Function\n");
4209         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4210                 if (pt->type == htons(ETH_P_ALL))
4211                         seq_puts(seq, "ALL ");
4212                 else
4213                         seq_printf(seq, "%04x", ntohs(pt->type));
4214
4215                 seq_printf(seq, " %-8s %pF\n",
4216                            pt->dev ? pt->dev->name : "", pt->func);
4217         }
4218
4219         return 0;
4220 }
4221
/* seq_file iterator over the registered packet types. */
static const struct seq_operations ptype_seq_ops = {
	.start = ptype_seq_start,
	.next  = ptype_seq_next,
	.stop  = ptype_seq_stop,
	.show  = ptype_seq_show,
};
4228
/* ->open: set up a namespace-aware seq_file that iterates with ptype_seq_ops. */
static int ptype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ptype_seq_ops,
			sizeof(struct seq_net_private));
}
4234
/* File operations of the per-namespace "ptype" proc entry. */
static const struct file_operations ptype_seq_fops = {
	.owner	 = THIS_MODULE,
	.open    = ptype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
4242
4243
4244 static int __net_init dev_proc_net_init(struct net *net)
4245 {
4246         int rc = -ENOMEM;
4247
4248         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4249                 goto out;
4250         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4251                 goto out_dev;
4252         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4253                 goto out_softnet;
4254
4255         if (wext_proc_init(net))
4256                 goto out_ptype;
4257         rc = 0;
4258 out:
4259         return rc;
4260 out_ptype:
4261         proc_net_remove(net, "ptype");
4262 out_softnet:
4263         proc_net_remove(net, "softnet_stat");
4264 out_dev:
4265         proc_net_remove(net, "dev");
4266         goto out;
4267 }
4268
/*
 * Undo dev_proc_net_init() for @net: tear down the wireless-extensions proc
 * support, then remove the three proc entries in reverse creation order.
 */
static void __net_exit dev_proc_net_exit(struct net *net)
{
	wext_proc_exit(net);

	proc_net_remove(net, "ptype");
	proc_net_remove(net, "softnet_stat");
	proc_net_remove(net, "dev");
}
4277
/* Per-namespace setup/teardown hooks for the proc entries above. */
static struct pernet_operations __net_initdata dev_proc_ops = {
	.init = dev_proc_net_init,
	.exit = dev_proc_net_exit,
};
4282
/* Register dev_proc_ops; returns the result of register_pernet_subsys(). */
static int __init dev_proc_init(void)
{
	return register_pernet_subsys(&dev_proc_ops);
}
4287 #else
4288 #define dev_proc_init() 0
4289 #endif  /* CONFIG_PROC_FS */
4290
4291
4292 /**
4293  *      netdev_set_master       -       set up master pointer
4294  *      @slave: slave device
4295  *      @master: new master device
4296  *
4297  *      Changes the master device of the slave. Pass %NULL to break the
4298  *      bonding. The caller must hold the RTNL semaphore. On a failure
4299  *      a negative errno code is returned. On success the reference counts
4300  *      are adjusted and the function returns zero.
4301  */
4302 int netdev_set_master(struct net_device *slave, struct net_device *master)
4303 {
4304         struct net_device *old = slave->master;
4305
4306         ASSERT_RTNL();
4307
4308         if (master) {
4309                 if (old)
4310                         return -EBUSY;
4311                 dev_hold(master);
4312         }
4313
4314         slave->master = master;
4315
4316         if (old) {
4317                 synchronize_net();
4318                 dev_put(old);
4319         }
4320         return 0;
4321 }
4322 EXPORT_SYMBOL(netdev_set_master);
4323
4324 /**
4325  *      netdev_set_bond_master  -       set up bonding master/slave pair
4326  *      @slave: slave device
4327  *      @master: new master device
4328  *
4329  *      Changes the master device of the slave. Pass %NULL to break the
4330  *      bonding. The caller must hold the RTNL semaphore. On a failure
4331  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4332  *      to the routing socket and the function returns zero.
4333  */
4334 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4335 {
4336         int err;
4337
4338         ASSERT_RTNL();
4339
4340         err = netdev_set_master(slave, master);
4341         if (err)
4342                 return err;
4343         if (master)
4344                 slave->flags |= IFF_SLAVE;
4345         else
4346                 slave->flags &= ~IFF_SLAVE;
4347
4348         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4349         return 0;
4350 }
4351 EXPORT_SYMBOL(netdev_set_bond_master);
4352
4353 static void dev_change_rx_flags(struct net_device *dev, int flags)
4354 {
4355         const struct net_device_ops *ops = dev->netdev_ops;
4356
4357         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4358                 ops->ndo_change_rx_flags(dev, flags);
4359 }
4360
4361 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4362 {
4363         unsigned short old_flags = dev->flags;
4364         uid_t uid;
4365         gid_t gid;
4366
4367         ASSERT_RTNL();
4368
4369         dev->flags |= IFF_PROMISC;
4370         dev->promiscuity += inc;
4371         if (dev->promiscuity == 0) {
4372                 /*
4373                  * Avoid overflow.
4374                  * If inc causes overflow, untouch promisc and return error.
4375                  */
4376                 if (inc < 0)
4377                         dev->flags &= ~IFF_PROMISC;
4378                 else {
4379                         dev->promiscuity -= inc;
4380                         printk(KERN_WARNING "%s: promiscuity touches roof, "
4381                                 "set promiscuity failed, promiscuity feature "
4382                                 "of device might be broken.\n", dev->name);
4383                         return -EOVERFLOW;
4384                 }
4385         }
4386         if (dev->flags != old_flags) {
4387                 printk(KERN_INFO "device %s %s promiscuous mode\n",
4388                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4389                                                                "left");
4390                 if (audit_enabled) {
4391                         current_uid_gid(&uid, &gid);
4392                         audit_log(current->audit_context, GFP_ATOMIC,
4393                                 AUDIT_ANOM_PROMISCUOUS,
4394                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4395                                 dev->name, (dev->flags & IFF_PROMISC),
4396                                 (old_flags & IFF_PROMISC),
4397                                 audit_get_loginuid(current),
4398                                 uid, gid,
4399                                 audit_get_sessionid(current));
4400                 }
4401
4402                 dev_change_rx_flags(dev, IFF_PROMISC);
4403         }
4404         return 0;
4405 }
4406
4407 /**
4408  *      dev_set_promiscuity     - update promiscuity count on a device
4409  *      @dev: device
4410  *      @inc: modifier
4411  *
4412  *      Add or remove promiscuity from a device. While the count in the device
4413  *      remains above zero the interface remains promiscuous. Once it hits zero
4414  *      the device reverts back to normal filtering operation. A negative inc
4415  *      value is used to drop promiscuity on the device.
4416  *      Return 0 if successful or a negative errno code on error.
4417  */
4418 int dev_set_promiscuity(struct net_device *dev, int inc)
4419 {
4420         unsigned short old_flags = dev->flags;
4421         int err;
4422
4423         err = __dev_set_promiscuity(dev, inc);
4424         if (err < 0)
4425                 return err;
4426         if (dev->flags != old_flags)
4427                 dev_set_rx_mode(dev);
4428         return err;
4429 }
4430 EXPORT_SYMBOL(dev_set_promiscuity);
4431
4432 /**
4433  *      dev_set_allmulti        - update allmulti count on a device
4434  *      @dev: device
4435  *      @inc: modifier
4436  *
4437  *      Add or remove reception of all multicast frames to a device. While the
4438  *      count in the device remains above zero the interface remains listening
4439  *      to all interfaces. Once it hits zero the device reverts back to normal
4440  *      filtering operation. A negative @inc value is used to drop the counter
4441  *      when releasing a resource needing all multicasts.
4442  *      Return 0 if successful or a negative errno code on error.
4443  */
4444
4445 int dev_set_allmulti(struct net_device *dev, int inc)
4446 {
4447         unsigned short old_flags = dev->flags;
4448
4449         ASSERT_RTNL();
4450
4451         dev->flags |= IFF_ALLMULTI;
4452         dev->allmulti += inc;
4453         if (dev->allmulti == 0) {
4454                 /*
4455                  * Avoid overflow.
4456                  * If inc causes overflow, untouch allmulti and return error.
4457                  */
4458                 if (inc < 0)
4459                         dev->flags &= ~IFF_ALLMULTI;
4460                 else {
4461                         dev->allmulti -= inc;
4462                         printk(KERN_WARNING "%s: allmulti touches roof, "
4463                                 "set allmulti failed, allmulti feature of "
4464                                 "device might be broken.\n", dev->name);
4465                         return -EOVERFLOW;
4466                 }
4467         }
4468         if (dev->flags ^ old_flags) {
4469                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4470                 dev_set_rx_mode(dev);
4471         }
4472         return 0;
4473 }
4474 EXPORT_SYMBOL(dev_set_allmulti);
4475
4476 /*
4477  *      Upload unicast and multicast address lists to device and
4478  *      configure RX filtering. When the device doesn't support unicast
4479  *      filtering it is put in promiscuous mode while unicast addresses
4480  *      are present.
4481  */
4482 void __dev_set_rx_mode(struct net_device *dev)
4483 {
4484         const struct net_device_ops *ops = dev->netdev_ops;
4485
4486         /* dev_open will call this function so the list will stay sane. */
4487         if (!(dev->flags&IFF_UP))
4488                 return;
4489
4490         if (!netif_device_present(dev))
4491                 return;
4492
4493         if (ops->ndo_set_rx_mode)
4494                 ops->ndo_set_rx_mode(dev);
4495         else {
4496                 /* Unicast addresses changes may only happen under the rtnl,
4497                  * therefore calling __dev_set_promiscuity here is safe.
4498                  */
4499                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4500                         __dev_set_promiscuity(dev, 1);
4501                         dev->uc_promisc = 1;
4502                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4503                         __dev_set_promiscuity(dev, -1);
4504                         dev->uc_promisc = 0;
4505                 }
4506
4507                 if (ops->ndo_set_multicast_list)
4508                         ops->ndo_set_multicast_list(dev);
4509         }
4510 }
4511
/*
 *	Locked wrapper: runs __dev_set_rx_mode() between
 *	netif_addr_lock_bh() and netif_addr_unlock_bh() on @dev.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
4518
4519 /**
4520  *      dev_get_flags - get flags reported to userspace
4521  *      @dev: device
4522  *
4523  *      Get the combination of flag bits exported through APIs to userspace.
4524  */
4525 unsigned dev_get_flags(const struct net_device *dev)
4526 {
4527         unsigned flags;
4528
4529         flags = (dev->flags & ~(IFF_PROMISC |
4530                                 IFF_ALLMULTI |
4531                                 IFF_RUNNING |
4532                                 IFF_LOWER_UP |
4533                                 IFF_DORMANT)) |
4534                 (dev->gflags & (IFF_PROMISC |
4535                                 IFF_ALLMULTI));
4536
4537         if (netif_running(dev)) {
4538                 if (netif_oper_up(dev))
4539                         flags |= IFF_RUNNING;
4540                 if (netif_carrier_ok(dev))
4541                         flags |= IFF_LOWER_UP;
4542                 if (netif_dormant(dev))
4543                         flags |= IFF_DORMANT;
4544         }
4545
4546         return flags;
4547 }
4548 EXPORT_SYMBOL(dev_get_flags);
4549
4550 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4551 {
4552         int old_flags = dev->flags;
4553         int ret;
4554
4555         ASSERT_RTNL();
4556
4557         /*
4558          *      Set the flags on our device.
4559          */
4560
4561         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4562                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4563                                IFF_AUTOMEDIA)) |
4564                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4565                                     IFF_ALLMULTI));
4566
4567         /*
4568          *      Load in the correct multicast list now the flags have changed.
4569          */
4570
4571         if ((old_flags ^ flags) & IFF_MULTICAST)
4572                 dev_change_rx_flags(dev, IFF_MULTICAST);
4573
4574         dev_set_rx_mode(dev);
4575
4576         /*
4577          *      Have we downed the interface. We handle IFF_UP ourselves
4578          *      according to user attempts to set it, rather than blindly
4579          *      setting it.
4580          */
4581
4582         ret = 0;
4583         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4584                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4585
4586                 if (!ret)
4587                         dev_set_rx_mode(dev);
4588         }
4589
4590         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4591                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4592
4593                 dev->gflags ^= IFF_PROMISC;
4594                 dev_set_promiscuity(dev, inc);
4595         }
4596
4597         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4598            is important. Some (broken) drivers set IFF_PROMISC, when
4599            IFF_ALLMULTI is requested not asking us and not reporting.
4600          */
4601         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4602                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4603
4604                 dev->gflags ^= IFF_ALLMULTI;
4605                 dev_set_allmulti(dev, inc);
4606         }
4607
4608         return ret;
4609 }
4610
4611 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4612 {
4613         unsigned int changes = dev->flags ^ old_flags;
4614
4615         if (changes & IFF_UP) {
4616                 if (dev->flags & IFF_UP)
4617                         call_netdevice_notifiers(NETDEV_UP, dev);
4618                 else
4619                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4620         }
4621
4622         if (dev->flags & IFF_UP &&
4623             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4624                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4625 }
4626
4627 /**
4628  *      dev_change_flags - change device settings
4629  *      @dev: device
4630  *      @flags: device state flags
4631  *
4632  *      Change settings on device based state flags. The flags are
4633  *      in the userspace exported format.
4634  */
4635 int dev_change_flags(struct net_device *dev, unsigned flags)
4636 {
4637         int ret, changes;
4638         int old_flags = dev->flags;
4639
4640         ret = __dev_change_flags(dev, flags);
4641         if (ret < 0)
4642                 return ret;
4643
4644         changes = old_flags ^ dev->flags;
4645         if (changes)
4646                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4647
4648         __dev_notify_flags(dev, old_flags);
4649         return ret;
4650 }
4651 EXPORT_SYMBOL(dev_change_flags);
4652
4653 /**
4654  *      dev_set_mtu - Change maximum transfer unit
4655  *      @dev: device
4656  *      @new_mtu: new transfer unit
4657  *
4658  *      Change the maximum transfer size of the network device.
4659  */
4660 int dev_set_mtu(struct net_device *dev, int new_mtu)
4661 {
4662         const struct net_device_ops *ops = dev->netdev_ops;
4663         int err;
4664
4665         if (new_mtu == dev->mtu)
4666                 return 0;
4667
4668         /*      MTU must be positive.    */
4669         if (new_mtu < 0)
4670                 return -EINVAL;
4671
4672         if (!netif_device_present(dev))
4673                 return -ENODEV;
4674
4675         err = 0;
4676         if (ops->ndo_change_mtu)
4677                 err = ops->ndo_change_mtu(dev, new_mtu);
4678         else
4679                 dev->mtu = new_mtu;
4680
4681         if (!err && dev->flags & IFF_UP)
4682                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4683         return err;
4684 }
4685 EXPORT_SYMBOL(dev_set_mtu);
4686
4687 /**
4688  *      dev_set_group - Change group this device belongs to
4689  *      @dev: device
4690  *      @new_group: group this device should belong to
4691  */
4692 void dev_set_group(struct net_device *dev, int new_group)
4693 {
4694         dev->group = new_group;
4695 }
4696 EXPORT_SYMBOL(dev_set_group);
4697
4698 /**
4699  *      dev_set_mac_address - Change Media Access Control Address
4700  *      @dev: device
4701  *      @sa: new address
4702  *
4703  *      Change the hardware (MAC) address of the device
4704  */
4705 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4706 {
4707         const struct net_device_ops *ops = dev->netdev_ops;
4708         int err;
4709
4710         if (!ops->ndo_set_mac_address)
4711                 return -EOPNOTSUPP;
4712         if (sa->sa_family != dev->type)
4713                 return -EINVAL;
4714         if (!netif_device_present(dev))
4715                 return -ENODEV;
4716         err = ops->ndo_set_mac_address(dev, sa);
4717         if (!err)
4718                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4719         return err;
4720 }
4721 EXPORT_SYMBOL(dev_set_mac_address);
4722
4723 /*
4724  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4725  */
4726 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4727 {
4728         int err;
4729         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4730
4731         if (!dev)
4732                 return -ENODEV;
4733
4734         switch (cmd) {
4735         case SIOCGIFFLAGS:      /* Get interface flags */
4736                 ifr->ifr_flags = (short) dev_get_flags(dev);
4737                 return 0;
4738
4739         case SIOCGIFMETRIC:     /* Get the metric on the interface
4740                                    (currently unused) */
4741                 ifr->ifr_metric = 0;
4742                 return 0;
4743
4744         case SIOCGIFMTU:        /* Get the MTU of a device */
4745                 ifr->ifr_mtu = dev->mtu;
4746                 return 0;
4747
4748         case SIOCGIFHWADDR:
4749                 if (!dev->addr_len)
4750                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4751                 else
4752                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4753                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4754                 ifr->ifr_hwaddr.sa_family = dev->type;
4755                 return 0;
4756
4757         case SIOCGIFSLAVE:
4758                 err = -EINVAL;
4759                 break;
4760
4761         case SIOCGIFMAP:
4762                 ifr->ifr_map.mem_start = dev->mem_start;
4763                 ifr->ifr_map.mem_end   = dev->mem_end;
4764                 ifr->ifr_map.base_addr = dev->base_addr;
4765                 ifr->ifr_map.irq       = dev->irq;
4766                 ifr->ifr_map.dma       = dev->dma;
4767                 ifr->ifr_map.port      = dev->if_port;
4768                 return 0;
4769
4770         case SIOCGIFINDEX:
4771                 ifr->ifr_ifindex = dev->ifindex;
4772                 return 0;
4773
4774         case SIOCGIFTXQLEN:
4775                 ifr->ifr_qlen = dev->tx_queue_len;
4776                 return 0;
4777
4778         default:
4779                 /* dev_ioctl() should ensure this case
4780                  * is never reached
4781                  */
4782                 WARN_ON(1);
4783                 err = -EINVAL;
4784                 break;
4785
4786         }
4787         return err;
4788 }
4789
4790 /*
4791  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4792  */
4793 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4794 {
4795         int err;
4796         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4797         const struct net_device_ops *ops;
4798
4799         if (!dev)
4800                 return -ENODEV;
4801
4802         ops = dev->netdev_ops;
4803
4804         switch (cmd) {
4805         case SIOCSIFFLAGS:      /* Set interface flags */
4806                 return dev_change_flags(dev, ifr->ifr_flags);
4807
4808         case SIOCSIFMETRIC:     /* Set the metric on the interface
4809                                    (currently unused) */
4810                 return -EOPNOTSUPP;
4811
4812         case SIOCSIFMTU:        /* Set the MTU of a device */
4813                 return dev_set_mtu(dev, ifr->ifr_mtu);
4814
4815         case SIOCSIFHWADDR:
4816                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4817
4818         case SIOCSIFHWBROADCAST:
4819                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4820                         return -EINVAL;
4821                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4822                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4823                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4824                 return 0;
4825
4826         case SIOCSIFMAP:
4827                 if (ops->ndo_set_config) {
4828                         if (!netif_device_present(dev))
4829                                 return -ENODEV;
4830                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4831                 }
4832                 return -EOPNOTSUPP;
4833
4834         case SIOCADDMULTI:
4835                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4836                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4837                         return -EINVAL;
4838                 if (!netif_device_present(dev))
4839                         return -ENODEV;
4840                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4841
4842         case SIOCDELMULTI:
4843                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4844                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4845                         return -EINVAL;
4846                 if (!netif_device_present(dev))
4847                         return -ENODEV;
4848                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4849
4850         case SIOCSIFTXQLEN:
4851                 if (ifr->ifr_qlen < 0)
4852                         return -EINVAL;
4853                 dev->tx_queue_len = ifr->ifr_qlen;
4854                 return 0;
4855
4856         case SIOCSIFNAME:
4857                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4858                 return dev_change_name(dev, ifr->ifr_newname);
4859
4860         /*
4861          *      Unknown or private ioctl
4862          */
4863         default:
4864                 if ((cmd >= SIOCDEVPRIVATE &&
4865                     cmd <= SIOCDEVPRIVATE + 15) ||
4866                     cmd == SIOCBONDENSLAVE ||
4867                     cmd == SIOCBONDRELEASE ||
4868                     cmd == SIOCBONDSETHWADDR ||
4869                     cmd == SIOCBONDSLAVEINFOQUERY ||
4870                     cmd == SIOCBONDINFOQUERY ||
4871                     cmd == SIOCBONDCHANGEACTIVE ||
4872                     cmd == SIOCGMIIPHY ||
4873                     cmd == SIOCGMIIREG ||
4874                     cmd == SIOCSMIIREG ||
4875                     cmd == SIOCBRADDIF ||
4876                     cmd == SIOCBRDELIF ||
4877                     cmd == SIOCSHWTSTAMP ||
4878                     cmd == SIOCWANDEV) {
4879                         err = -EOPNOTSUPP;
4880                         if (ops->ndo_do_ioctl) {
4881                                 if (netif_device_present(dev))
4882                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4883                                 else
4884                                         err = -ENODEV;
4885                         }
4886                 } else
4887                         err = -EINVAL;
4888
4889         }
4890         return err;
4891 }
4892
4893 /*
4894  *      This function handles all "interface"-type I/O control requests. The actual
4895  *      'doing' part of this is dev_ifsioc above.
4896  */
4897
4898 /**
4899  *      dev_ioctl       -       network device ioctl
4900  *      @net: the applicable net namespace
4901  *      @cmd: command to issue
4902  *      @arg: pointer to a struct ifreq in user space
4903  *
4904  *      Issue ioctl functions to devices. This is normally called by the
4905  *      user space syscall interfaces but can sometimes be useful for
4906  *      other purposes. The return value is the return from the syscall if
4907  *      positive or a negative errno code on error.
4908  */
4909
4910 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4911 {
4912         struct ifreq ifr;
4913         int ret;
4914         char *colon;
4915
4916         /* One special case: SIOCGIFCONF takes ifconf argument
4917            and requires shared lock, because it sleeps writing
4918            to user space.
4919          */
4920
4921         if (cmd == SIOCGIFCONF) {
4922                 rtnl_lock();
4923                 ret = dev_ifconf(net, (char __user *) arg);
4924                 rtnl_unlock();
4925                 return ret;
4926         }
4927         if (cmd == SIOCGIFNAME)
4928                 return dev_ifname(net, (struct ifreq __user *)arg);
4929
4930         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4931                 return -EFAULT;
4932
4933         ifr.ifr_name[IFNAMSIZ-1] = 0;
4934
4935         colon = strchr(ifr.ifr_name, ':');
4936         if (colon)
4937                 *colon = 0;
4938
4939         /*
4940          *      See which interface the caller is talking about.
4941          */
4942
4943         switch (cmd) {
4944         /*
4945          *      These ioctl calls:
4946          *      - can be done by all.
4947          *      - atomic and do not require locking.
4948          *      - return a value
4949          */
4950         case SIOCGIFFLAGS:
4951         case SIOCGIFMETRIC:
4952         case SIOCGIFMTU:
4953         case SIOCGIFHWADDR:
4954         case SIOCGIFSLAVE:
4955         case SIOCGIFMAP:
4956         case SIOCGIFINDEX:
4957         case SIOCGIFTXQLEN:
4958                 dev_load(net, ifr.ifr_name);
4959                 rcu_read_lock();
4960                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4961                 rcu_read_unlock();
4962                 if (!ret) {
4963                         if (colon)
4964                                 *colon = ':';
4965                         if (copy_to_user(arg, &ifr,
4966                                          sizeof(struct ifreq)))
4967                                 ret = -EFAULT;
4968                 }
4969                 return ret;
4970
4971         case SIOCETHTOOL:
4972                 dev_load(net, ifr.ifr_name);
4973                 rtnl_lock();
4974                 ret = dev_ethtool(net, &ifr);
4975                 rtnl_unlock();
4976                 if (!ret) {
4977                         if (colon)
4978                                 *colon = ':';
4979                         if (copy_to_user(arg, &ifr,
4980                                          sizeof(struct ifreq)))
4981                                 ret = -EFAULT;
4982                 }
4983                 return ret;
4984
4985         /*
4986          *      These ioctl calls:
4987          *      - require superuser power.
4988          *      - require strict serialization.
4989          *      - return a value
4990          */
4991         case SIOCGMIIPHY:
4992         case SIOCGMIIREG:
4993         case SIOCSIFNAME:
4994                 if (!capable(CAP_NET_ADMIN))
4995                         return -EPERM;
4996                 dev_load(net, ifr.ifr_name);
4997                 rtnl_lock();
4998                 ret = dev_ifsioc(net, &ifr, cmd);
4999                 rtnl_unlock();
5000                 if (!ret) {
5001                         if (colon)
5002                                 *colon = ':';
5003                         if (copy_to_user(arg, &ifr,
5004                                          sizeof(struct ifreq)))
5005                                 ret = -EFAULT;
5006                 }
5007                 return ret;
5008
5009         /*
5010          *      These ioctl calls:
5011          *      - require superuser power.
5012          *      - require strict serialization.
5013          *      - do not return a value
5014          */
5015         case SIOCSIFFLAGS:
5016         case SIOCSIFMETRIC:
5017         case SIOCSIFMTU:
5018         case SIOCSIFMAP:
5019         case SIOCSIFHWADDR:
5020         case SIOCSIFSLAVE:
5021         case SIOCADDMULTI:
5022         case SIOCDELMULTI:
5023         case SIOCSIFHWBROADCAST:
5024         case SIOCSIFTXQLEN:
5025         case SIOCSMIIREG:
5026         case SIOCBONDENSLAVE:
5027         case SIOCBONDRELEASE:
5028         case SIOCBONDSETHWADDR:
5029         case SIOCBONDCHANGEACTIVE:
5030         case SIOCBRADDIF:
5031         case SIOCBRDELIF:
5032         case SIOCSHWTSTAMP:
5033                 if (!capable(CAP_NET_ADMIN))
5034                         return -EPERM;
5035                 /* fall through */
5036         case SIOCBONDSLAVEINFOQUERY:
5037         case SIOCBONDINFOQUERY:
5038                 dev_load(net, ifr.ifr_name);
5039                 rtnl_lock();
5040                 ret = dev_ifsioc(net, &ifr, cmd);
5041                 rtnl_unlock();
5042                 return ret;
5043
5044         case SIOCGIFMEM:
5045                 /* Get the per device memory space. We can add this but
5046                  * currently do not support it */
5047         case SIOCSIFMEM:
5048                 /* Set the per device memory buffer space.
5049                  * Not applicable in our case */
5050         case SIOCSIFLINK:
5051                 return -EINVAL;
5052
5053         /*
5054          *      Unknown or private ioctl.
5055          */
5056         default:
5057                 if (cmd == SIOCWANDEV ||
5058                     (cmd >= SIOCDEVPRIVATE &&
5059                      cmd <= SIOCDEVPRIVATE + 15)) {
5060                         dev_load(net, ifr.ifr_name);
5061                         rtnl_lock();
5062                         ret = dev_ifsioc(net, &ifr, cmd);
5063                         rtnl_unlock();
5064                         if (!ret && copy_to_user(arg, &ifr,
5065                                                  sizeof(struct ifreq)))
5066                                 ret = -EFAULT;
5067                         return ret;
5068                 }
5069                 /* Take care of Wireless Extensions */
5070                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5071                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5072                 return -EINVAL;
5073         }
5074 }
5075
5076
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	Candidates come from a function-static counter that restarts at 1
 *	whenever it would become zero or negative; values already in use in
 *	@net are skipped.
 */
static int dev_new_index(struct net *net)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(net, ifindex));

	return ifindex;
}
5095
5096 /* Delayed registration/unregisteration */
5097 static LIST_HEAD(net_todo_list);
5098
5099 static void net_set_todo(struct net_device *dev)
5100 {
5101         list_add_tail(&dev->todo_list, &net_todo_list);
5102 }
5103
5104 static void rollback_registered_many(struct list_head *head)
5105 {
5106         struct net_device *dev, *tmp;
5107
5108         BUG_ON(dev_boot_phase);
5109         ASSERT_RTNL();
5110
5111         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5112                 /* Some devices call without registering
5113                  * for initialization unwind. Remove those
5114                  * devices and proceed with the remaining.
5115                  */
5116                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5117                         pr_debug("unregister_netdevice: device %s/%p never "
5118                                  "was registered\n", dev->name, dev);
5119
5120                         WARN_ON(1);
5121                         list_del(&dev->unreg_list);
5122                         continue;
5123                 }
5124
5125                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5126         }
5127
5128         /* If device is running, close it first. */
5129         dev_close_many(head);
5130
5131         list_for_each_entry(dev, head, unreg_list) {
5132                 /* And unlink it from device chain. */
5133                 unlist_netdevice(dev);
5134
5135                 dev->reg_state = NETREG_UNREGISTERING;
5136         }
5137
5138         synchronize_net();
5139
5140         list_for_each_entry(dev, head, unreg_list) {
5141                 /* Shutdown queueing discipline. */
5142                 dev_shutdown(dev);
5143
5144
5145                 /* Notify protocols, that we are about to destroy
5146                    this device. They should clean all the things.
5147                 */
5148                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5149
5150                 if (!dev->rtnl_link_ops ||
5151                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5152                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5153
5154                 /*
5155                  *      Flush the unicast and multicast chains
5156                  */
5157                 dev_uc_flush(dev);
5158                 dev_mc_flush(dev);
5159
5160                 if (dev->netdev_ops->ndo_uninit)
5161                         dev->netdev_ops->ndo_uninit(dev);
5162
5163                 /* Notifier chain MUST detach us from master device. */
5164                 WARN_ON(dev->master);
5165
5166                 /* Remove entries from kobject tree */
5167                 netdev_unregister_kobject(dev);
5168         }
5169
5170         /* Process any work delayed until the end of the batch */
5171         dev = list_first_entry(head, struct net_device, unreg_list);
5172         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5173
5174         rcu_barrier();
5175
5176         list_for_each_entry(dev, head, unreg_list)
5177                 dev_put(dev);
5178 }
5179
5180 static void rollback_registered(struct net_device *dev)
5181 {
5182         LIST_HEAD(single);
5183
5184         list_add(&dev->unreg_list, &single);
5185         rollback_registered_many(&single);
5186         list_del(&single);
5187 }
5188
5189 u32 netdev_fix_features(struct net_device *dev, u32 features)
5190 {
5191         /* Fix illegal checksum combinations */
5192         if ((features & NETIF_F_HW_CSUM) &&
5193             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5194                 netdev_info(dev, "mixed HW and IP checksum settings.\n");
5195                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5196         }
5197
5198         if ((features & NETIF_F_NO_CSUM) &&
5199             (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5200                 netdev_info(dev, "mixed no checksumming and other settings.\n");
5201                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5202         }
5203
5204         /* Fix illegal SG+CSUM combinations. */
5205         if ((features & NETIF_F_SG) &&
5206             !(features & NETIF_F_ALL_CSUM)) {
5207                 netdev_info(dev,
5208                             "Dropping NETIF_F_SG since no checksum feature.\n");
5209                 features &= ~NETIF_F_SG;
5210         }
5211
5212         /* TSO requires that SG is present as well. */
5213         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5214                 netdev_info(dev, "Dropping TSO features since no SG feature.\n");
5215                 features &= ~NETIF_F_ALL_TSO;
5216         }
5217
5218         /* TSO ECN requires that TSO is present as well. */
5219         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5220                 features &= ~NETIF_F_TSO_ECN;
5221
5222         /* Software GSO depends on SG. */
5223         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5224                 netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5225                 features &= ~NETIF_F_GSO;
5226         }
5227
5228         /* UFO needs SG and checksumming */
5229         if (features & NETIF_F_UFO) {
5230                 /* maybe split UFO into V4 and V6? */
5231                 if (!((features & NETIF_F_GEN_CSUM) ||
5232                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5233                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5234                         netdev_info(dev,
5235                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5236                         features &= ~NETIF_F_UFO;
5237                 }
5238
5239                 if (!(features & NETIF_F_SG)) {
5240                         netdev_info(dev,
5241                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5242                         features &= ~NETIF_F_UFO;
5243                 }
5244         }
5245
5246         return features;
5247 }
5248 EXPORT_SYMBOL(netdev_fix_features);
5249
5250 int __netdev_update_features(struct net_device *dev)
5251 {
5252         u32 features;
5253         int err = 0;
5254
5255         ASSERT_RTNL();
5256
5257         features = netdev_get_wanted_features(dev);
5258
5259         if (dev->netdev_ops->ndo_fix_features)
5260                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5261
5262         /* driver might be less strict about feature dependencies */
5263         features = netdev_fix_features(dev, features);
5264
5265         if (dev->features == features)
5266                 return 0;
5267
5268         netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n",
5269                 dev->features, features);
5270
5271         if (dev->netdev_ops->ndo_set_features)
5272                 err = dev->netdev_ops->ndo_set_features(dev, features);
5273
5274         if (unlikely(err < 0)) {
5275                 netdev_err(dev,
5276                         "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5277                         err, features, dev->features);
5278                 return -1;
5279         }
5280
5281         if (!err)
5282                 dev->features = features;
5283
5284         return 1;
5285 }
5286
/*
 * Re-evaluate the device features and notify about the change.
 *
 * netdev_features_change() is called for any non-zero result of
 * __netdev_update_features().
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5293
5294 /**
5295  *      netif_stacked_transfer_operstate -      transfer operstate
5296  *      @rootdev: the root or lower level device to transfer state from
5297  *      @dev: the device to transfer operstate to
5298  *
5299  *      Transfer operational state from root to device. This is normally
5300  *      called when a stacking relationship exists between the root
5301  *      device and the device(a leaf device).
5302  */
5303 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5304                                         struct net_device *dev)
5305 {
5306         if (rootdev->operstate == IF_OPER_DORMANT)
5307                 netif_dormant_on(dev);
5308         else
5309                 netif_dormant_off(dev);
5310
5311         if (netif_carrier_ok(rootdev)) {
5312                 if (!netif_carrier_ok(dev))
5313                         netif_carrier_on(dev);
5314         } else {
5315                 if (netif_carrier_ok(dev))
5316                         netif_carrier_off(dev);
5317         }
5318 }
5319 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5320
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at its device.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int count = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int idx;

	BUG_ON(!count);

	queues = kcalloc(count, sizeof(*queues), GFP_KERNEL);
	if (queues == NULL) {
		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
		return -ENOMEM;
	}

	dev->_rx = queues;
	for (idx = 0; idx < count; idx++)
		queues[idx].dev = dev;

	return 0;
}
#endif
5341
5342 static void netdev_init_one_queue(struct net_device *dev,
5343                                   struct netdev_queue *queue, void *_unused)
5344 {
5345         /* Initialize queue lock */
5346         spin_lock_init(&queue->_xmit_lock);
5347         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5348         queue->xmit_lock_owner = -1;
5349         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5350         queue->dev = dev;
5351 }
5352
5353 static int netif_alloc_netdev_queues(struct net_device *dev)
5354 {
5355         unsigned int count = dev->num_tx_queues;
5356         struct netdev_queue *tx;
5357
5358         BUG_ON(count < 1);
5359
5360         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5361         if (!tx) {
5362                 pr_err("netdev: Unable to allocate %u tx queues.\n",
5363                        count);
5364                 return -ENOMEM;
5365         }
5366         dev->_tx = tx;
5367
5368         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5369         spin_lock_init(&dev->tx_global_lock);
5370
5371         return 0;
5372 }
5373
5374 /**
5375  *      register_netdevice      - register a network device
5376  *      @dev: device to register
5377  *
5378  *      Take a completed network device structure and add it to the kernel
5379  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5380  *      chain. 0 is returned on success. A negative errno code is returned
5381  *      on a failure to set up the device, or if the name is a duplicate.
5382  *
5383  *      Callers must hold the rtnl semaphore. You may want
5384  *      register_netdev() instead of this.
5385  *
5386  *      BUGS:
5387  *      The locking appears insufficient to guarantee two parallel registers
5388  *      will not get the same name.
5389  */
5390
5391 int register_netdevice(struct net_device *dev)
5392 {
5393         int ret;
5394         struct net *net = dev_net(dev);
5395
5396         BUG_ON(dev_boot_phase);
5397         ASSERT_RTNL();
5398
5399         might_sleep();
5400
5401         /* When net_device's are persistent, this will be fatal. */
5402         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5403         BUG_ON(!net);
5404
5405         spin_lock_init(&dev->addr_list_lock);
5406         netdev_set_addr_lockdep_class(dev);
5407
5408         dev->iflink = -1;
5409
5410         /* Init, if this function is available */
5411         if (dev->netdev_ops->ndo_init) {
5412                 ret = dev->netdev_ops->ndo_init(dev);
5413                 if (ret) {
5414                         if (ret > 0)
5415                                 ret = -EIO;
5416                         goto out;
5417                 }
5418         }
5419
5420         ret = dev_get_valid_name(dev, dev->name, 0);
5421         if (ret)
5422                 goto err_uninit;
5423
5424         dev->ifindex = dev_new_index(net);
5425         if (dev->iflink == -1)
5426                 dev->iflink = dev->ifindex;
5427
5428         /* Transfer changeable features to wanted_features and enable
5429          * software offloads (GSO and GRO).
5430          */
5431         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5432         dev->features |= NETIF_F_SOFT_FEATURES;
5433         dev->wanted_features = dev->features & dev->hw_features;
5434
5435         /* Avoid warning from netdev_fix_features() for GSO without SG */
5436         if (!(dev->wanted_features & NETIF_F_SG)) {
5437                 dev->wanted_features &= ~NETIF_F_GSO;
5438                 dev->features &= ~NETIF_F_GSO;
5439         }
5440
5441         /* Turn on no cache copy if HW is doing checksum */
5442         dev->hw_features |= NETIF_F_NOCACHE_COPY;
5443         if ((dev->features & NETIF_F_ALL_CSUM) &&
5444             !(dev->features & NETIF_F_NO_CSUM)) {
5445                 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5446                 dev->features |= NETIF_F_NOCACHE_COPY;
5447         }
5448
5449         /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5450          * vlan_dev_init() will do the dev->features check, so these features
5451          * are enabled only if supported by underlying device.
5452          */
5453         dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5454
5455         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5456         ret = notifier_to_errno(ret);
5457         if (ret)
5458                 goto err_uninit;
5459
5460         ret = netdev_register_kobject(dev);
5461         if (ret)
5462                 goto err_uninit;
5463         dev->reg_state = NETREG_REGISTERED;
5464
5465         __netdev_update_features(dev);
5466
5467         /*
5468          *      Default initial state at registry is that the
5469          *      device is present.
5470          */
5471
5472         set_bit(__LINK_STATE_PRESENT, &dev->state);
5473
5474         dev_init_scheduler(dev);
5475         dev_hold(dev);
5476         list_netdevice(dev);
5477
5478         /* Notify protocols, that a new device appeared. */
5479         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5480         ret = notifier_to_errno(ret);
5481         if (ret) {
5482                 rollback_registered(dev);
5483                 dev->reg_state = NETREG_UNREGISTERED;
5484         }
5485         /*
5486          *      Prevent userspace races by waiting until the network
5487          *      device is fully setup before sending notifications.
5488          */
5489         if (!dev->rtnl_link_ops ||
5490             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5491                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5492
5493 out:
5494         return ret;
5495
5496 err_uninit:
5497         if (dev->netdev_ops->ndo_uninit)
5498                 dev->netdev_ops->ndo_uninit(dev);
5499         goto out;
5500 }
5501 EXPORT_SYMBOL(register_netdevice);
5502
5503 /**
5504  *      init_dummy_netdev       - init a dummy network device for NAPI
5505  *      @dev: device to init
5506  *
5507  *      This takes a network device structure and initialize the minimum
5508  *      amount of fields so it can be used to schedule NAPI polls without
5509  *      registering a full blown interface. This is to be used by drivers
5510  *      that need to tie several hardware interfaces to a single NAPI
5511  *      poll scheduler due to HW limitations.
5512  */
5513 int init_dummy_netdev(struct net_device *dev)
5514 {
5515         /* Clear everything. Note we don't initialize spinlocks
5516          * are they aren't supposed to be taken by any of the
5517          * NAPI code and this dummy netdev is supposed to be
5518          * only ever used for NAPI polls
5519          */
5520         memset(dev, 0, sizeof(struct net_device));
5521
5522         /* make sure we BUG if trying to hit standard
5523          * register/unregister code path
5524          */
5525         dev->reg_state = NETREG_DUMMY;
5526
5527         /* NAPI wants this */
5528         INIT_LIST_HEAD(&dev->napi_list);
5529
5530         /* a dummy interface is started by default */
5531         set_bit(__LINK_STATE_PRESENT, &dev->state);
5532         set_bit(__LINK_STATE_START, &dev->state);
5533
5534         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5535          * because users of this 'device' dont need to change
5536          * its refcount.
5537          */
5538
5539         return 0;
5540 }
5541 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5542
5543
5544 /**
5545  *      register_netdev - register a network device
5546  *      @dev: device to register
5547  *
5548  *      Take a completed network device structure and add it to the kernel
5549  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5550  *      chain. 0 is returned on success. A negative errno code is returned
5551  *      on a failure to set up the device, or if the name is a duplicate.
5552  *
5553  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5554  *      and expands the device name if you passed a format string to
5555  *      alloc_netdev.
5556  */
5557 int register_netdev(struct net_device *dev)
5558 {
5559         int err;
5560
5561         rtnl_lock();
5562
5563         /*
5564          * If the name is a format string the caller wants us to do a
5565          * name allocation.
5566          */
5567         if (strchr(dev->name, '%')) {
5568                 err = dev_alloc_name(dev, dev->name);
5569                 if (err < 0)
5570                         goto out;
5571         }
5572
5573         err = register_netdevice(dev);
5574 out:
5575         rtnl_unlock();
5576         return err;
5577 }
5578 EXPORT_SYMBOL(register_netdev);
5579
5580 int netdev_refcnt_read(const struct net_device *dev)
5581 {
5582         int i, refcnt = 0;
5583
5584         for_each_possible_cpu(i)
5585                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5586         return refcnt;
5587 }
5588 EXPORT_SYMBOL(netdev_refcnt_read);
5589
5590 /*
5591  * netdev_wait_allrefs - wait until all references are gone.
5592  *
5593  * This is called when unregistering network devices.
5594  *
5595  * Any protocol or device that holds a reference should register
5596  * for netdevice notification, and cleanup and put back the
5597  * reference if they receive an UNREGISTER event.
5598  * We can get stuck here if buggy protocols don't correctly
5599  * call dev_put.
5600  */
5601 static void netdev_wait_allrefs(struct net_device *dev)
5602 {
5603         unsigned long rebroadcast_time, warning_time;
5604         int refcnt;
5605
5606         linkwatch_forget_dev(dev);
5607
5608         rebroadcast_time = warning_time = jiffies;
5609         refcnt = netdev_refcnt_read(dev);
5610
5611         while (refcnt != 0) {
5612                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5613                         rtnl_lock();
5614
5615                         /* Rebroadcast unregister notification */
5616                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5617                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5618                          * should have already handle it the first time */
5619
5620                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5621                                      &dev->state)) {
5622                                 /* We must not have linkwatch events
5623                                  * pending on unregister. If this
5624                                  * happens, we simply run the queue
5625                                  * unscheduled, resulting in a noop
5626                                  * for this device.
5627                                  */
5628                                 linkwatch_run_queue();
5629                         }
5630
5631                         __rtnl_unlock();
5632
5633                         rebroadcast_time = jiffies;
5634                 }
5635
5636                 msleep(250);
5637
5638                 refcnt = netdev_refcnt_read(dev);
5639
5640                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5641                         printk(KERN_EMERG "unregister_netdevice: "
5642                                "waiting for %s to become free. Usage "
5643                                "count = %d\n",
5644                                dev->name, refcnt);
5645                         warning_time = jiffies;
5646                 }
5647         }
5648 }
5649
5650 /* The sequence is:
5651  *
5652  *      rtnl_lock();
5653  *      ...
5654  *      register_netdevice(x1);
5655  *      register_netdevice(x2);
5656  *      ...
5657  *      unregister_netdevice(y1);
5658  *      unregister_netdevice(y2);
5659  *      ...
5660  *      rtnl_unlock();
5661  *      free_netdev(y1);
5662  *      free_netdev(y2);
5663  *
5664  * We are invoked by rtnl_unlock().
5665  * This allows us to deal with problems:
5666  * 1) We can delete sysfs objects which invoke hotplug
5667  *    without deadlocking with linkwatch via keventd.
5668  * 2) Since we run with the RTNL semaphore not held, we can sleep
5669  *    safely in order to wait for the netdev refcnt to drop to zero.
5670  *
5671  * We must not return until all unregister events added during
5672  * the interval the lock was held have been completed.
5673  */
5674 void netdev_run_todo(void)
5675 {
5676         struct list_head list;
5677
5678         /* Snapshot list, allow later requests */
5679         list_replace_init(&net_todo_list, &list);
5680
5681         __rtnl_unlock();
5682
5683         while (!list_empty(&list)) {
5684                 struct net_device *dev
5685                         = list_first_entry(&list, struct net_device, todo_list);
5686                 list_del(&dev->todo_list);
5687
5688                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5689                         printk(KERN_ERR "network todo '%s' but state %d\n",
5690                                dev->name, dev->reg_state);
5691                         dump_stack();
5692                         continue;
5693                 }
5694
5695                 dev->reg_state = NETREG_UNREGISTERED;
5696
5697                 on_each_cpu(flush_backlog, dev, 1);
5698
5699                 netdev_wait_allrefs(dev);
5700
5701                 /* paranoia */
5702                 BUG_ON(netdev_refcnt_read(dev));
5703                 WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5704                 WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5705                 WARN_ON(dev->dn_ptr);
5706
5707                 if (dev->destructor)
5708                         dev->destructor(dev);
5709
5710                 /* Free network device */
5711                 kobject_put(&dev->dev.kobj);
5712         }
5713 }
5714
5715 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5716  * fields in the same order, with only the type differing.
5717  */
5718 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5719                                     const struct net_device_stats *netdev_stats)
5720 {
5721 #if BITS_PER_LONG == 64
5722         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5723         memcpy(stats64, netdev_stats, sizeof(*stats64));
5724 #else
5725         size_t i, n = sizeof(*stats64) / sizeof(u64);
5726         const unsigned long *src = (const unsigned long *)netdev_stats;
5727         u64 *dst = (u64 *)stats64;
5728
5729         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5730                      sizeof(*stats64) / sizeof(u64));
5731         for (i = 0; i < n; i++)
5732                 dst[i] = src[i];
5733 #endif
5734 }
5735
5736 /**
5737  *      dev_get_stats   - get network device statistics
5738  *      @dev: device to get statistics from
5739  *      @storage: place to store stats
5740  *
5741  *      Get network statistics from device. Return @storage.
5742  *      The device driver may provide its own method by setting
5743  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5744  *      otherwise the internal statistics structure is used.
5745  */
5746 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5747                                         struct rtnl_link_stats64 *storage)
5748 {
5749         const struct net_device_ops *ops = dev->netdev_ops;
5750
5751         if (ops->ndo_get_stats64) {
5752                 memset(storage, 0, sizeof(*storage));
5753                 ops->ndo_get_stats64(dev, storage);
5754         } else if (ops->ndo_get_stats) {
5755                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5756         } else {
5757                 netdev_stats_to_stats64(storage, &dev->stats);
5758         }
5759         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5760         return storage;
5761 }
5762 EXPORT_SYMBOL(dev_get_stats);
5763
/*
 * Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with the noop qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails. Without
 * CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
5781
5782 /**
5783  *      alloc_netdev_mqs - allocate network device
5784  *      @sizeof_priv:   size of private data to allocate space for
5785  *      @name:          device name format string
5786  *      @setup:         callback to initialize device
5787  *      @txqs:          the number of TX subqueues to allocate
5788  *      @rxqs:          the number of RX subqueues to allocate
5789  *
5790  *      Allocates a struct net_device with private data area for driver use
5791  *      and performs basic initialization.  Also allocates subquue structs
5792  *      for each queue on the device.
5793  */
5794 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5795                 void (*setup)(struct net_device *),
5796                 unsigned int txqs, unsigned int rxqs)
5797 {
5798         struct net_device *dev;
5799         size_t alloc_size;
5800         struct net_device *p;
5801
5802         BUG_ON(strlen(name) >= sizeof(dev->name));
5803
5804         if (txqs < 1) {
5805                 pr_err("alloc_netdev: Unable to allocate device "
5806                        "with zero queues.\n");
5807                 return NULL;
5808         }
5809
5810 #ifdef CONFIG_RPS
5811         if (rxqs < 1) {
5812                 pr_err("alloc_netdev: Unable to allocate device "
5813                        "with zero RX queues.\n");
5814                 return NULL;
5815         }
5816 #endif
5817
5818         alloc_size = sizeof(struct net_device);
5819         if (sizeof_priv) {
5820                 /* ensure 32-byte alignment of private area */
5821                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5822                 alloc_size += sizeof_priv;
5823         }
5824         /* ensure 32-byte alignment of whole construct */
5825         alloc_size += NETDEV_ALIGN - 1;
5826
5827         p = kzalloc(alloc_size, GFP_KERNEL);
5828         if (!p) {
5829                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5830                 return NULL;
5831         }
5832
5833         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5834         dev->padded = (char *)dev - (char *)p;
5835
5836         dev->pcpu_refcnt = alloc_percpu(int);
5837         if (!dev->pcpu_refcnt)
5838                 goto free_p;
5839
5840         if (dev_addr_init(dev))
5841                 goto free_pcpu;
5842
5843         dev_mc_init(dev);
5844         dev_uc_init(dev);
5845
5846         dev_net_set(dev, &init_net);
5847
5848         dev->gso_max_size = GSO_MAX_SIZE;
5849
5850         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5851         dev->ethtool_ntuple_list.count = 0;
5852         INIT_LIST_HEAD(&dev->napi_list);
5853         INIT_LIST_HEAD(&dev->unreg_list);
5854         INIT_LIST_HEAD(&dev->link_watch_list);
5855         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5856         setup(dev);
5857
5858         dev->num_tx_queues = txqs;
5859         dev->real_num_tx_queues = txqs;
5860         if (netif_alloc_netdev_queues(dev))
5861                 goto free_all;
5862
5863 #ifdef CONFIG_RPS
5864         dev->num_rx_queues = rxqs;
5865         dev->real_num_rx_queues = rxqs;
5866         if (netif_alloc_rx_queues(dev))
5867                 goto free_all;
5868 #endif
5869
5870         strcpy(dev->name, name);
5871         dev->group = INIT_NETDEV_GROUP;
5872         return dev;
5873
5874 free_all:
5875         free_netdev(dev);
5876         return NULL;
5877
5878 free_pcpu:
5879         free_percpu(dev->pcpu_refcnt);
5880         kfree(dev->_tx);
5881 #ifdef CONFIG_RPS
5882         kfree(dev->_rx);
5883 #endif
5884
5885 free_p:
5886         kfree(p);
5887         return NULL;
5888 }
5889 EXPORT_SYMBOL(alloc_netdev_mqs);
5890
5891 /**
5892  *      free_netdev - free network device
5893  *      @dev: device
5894  *
5895  *      This function does the last stage of destroying an allocated device
5896  *      interface. The reference to the device object is released.
5897  *      If this is the last reference then it will be freed.
5898  */
5899 void free_netdev(struct net_device *dev)
5900 {
5901         struct napi_struct *p, *n;
5902
5903         release_net(dev_net(dev));
5904
5905         kfree(dev->_tx);
5906 #ifdef CONFIG_RPS
5907         kfree(dev->_rx);
5908 #endif
5909
5910         kfree(rcu_dereference_raw(dev->ingress_queue));
5911
5912         /* Flush device addresses */
5913         dev_addr_flush(dev);
5914
5915         /* Clear ethtool n-tuple list */
5916         ethtool_ntuple_flush(dev);
5917
5918         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5919                 netif_napi_del(p);
5920
5921         free_percpu(dev->pcpu_refcnt);
5922         dev->pcpu_refcnt = NULL;
5923
5924         /*  Compatibility with error handling in drivers */
5925         if (dev->reg_state == NETREG_UNINITIALIZED) {
5926                 kfree((char *)dev - dev->padded);
5927                 return;
5928         }
5929
5930         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5931         dev->reg_state = NETREG_RELEASED;
5932
5933         /* will free via device release */
5934         put_device(&dev->dev);
5935 }
5936 EXPORT_SYMBOL(free_netdev);
5937
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Waits until packets that are currently being received have been
 *	processed. Packets that start being received later are not held
 *	back. Implemented as an RCU grace period, so the caller must be
 *	allowed to sleep.
 */
void synchronize_net(void)
{
	/* Flag callers that are in a context which must not sleep */
	might_sleep();

	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
5950
5951 /**
5952  *      unregister_netdevice_queue - remove device from the kernel
5953  *      @dev: device
5954  *      @head: list
5955  *
5956  *      This function shuts down a device interface and removes it
5957  *      from the kernel tables.
5958  *      If head not NULL, device is queued to be unregistered later.
5959  *
5960  *      Callers must hold the rtnl semaphore.  You may want
5961  *      unregister_netdev() instead of this.
5962  */
5963
5964 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5965 {
5966         ASSERT_RTNL();
5967
5968         if (head) {
5969                 list_move_tail(&dev->unreg_list, head);
5970         } else {
5971                 rollback_registered(dev);
5972                 /* Finish processing unregister after unlock */
5973                 net_set_todo(dev);
5974         }
5975 }
5976 EXPORT_SYMBOL(unregister_netdevice_queue);
5977
5978 /**
5979  *      unregister_netdevice_many - unregister many devices
5980  *      @head: list of devices
5981  */
5982 void unregister_netdevice_many(struct list_head *head)
5983 {
5984         struct net_device *dev;
5985
5986         if (!list_empty(head)) {
5987                 rollback_registered_many(head);
5988                 list_for_each_entry(dev, head, unreg_list)
5989                         net_set_todo(dev);
5990         }
5991 }
5992 EXPORT_SYMBOL(unregister_netdevice_many);
5993
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	This merely wraps unregister_netdevice() in rtnl_lock() /
 *	rtnl_unlock(). In general this is the function to use, rather
 *	than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6012
6013 /**
6014  *      dev_change_net_namespace - move device to different nethost namespace
6015  *      @dev: device
6016  *      @net: network namespace
6017  *      @pat: If not NULL name pattern to try if the current device name
6018  *            is already taken in the destination network namespace.
6019  *
6020  *      This function shuts down a device interface and moves it
6021  *      to a new network namespace. On success 0 is returned, on
6022  *      a failure a netagive errno code is returned.
6023  *
6024  *      Callers must hold the rtnl semaphore.
6025  */
6026
6027 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6028 {
6029         int err;
6030
6031         ASSERT_RTNL();
6032
6033         /* Don't allow namespace local devices to be moved. */
6034         err = -EINVAL;
6035         if (dev->features & NETIF_F_NETNS_LOCAL)
6036                 goto out;
6037
6038         /* Ensure the device has been registrered */
6039         err = -EINVAL;
6040         if (dev->reg_state != NETREG_REGISTERED)
6041                 goto out;
6042
6043         /* Get out if there is nothing todo */
6044         err = 0;
6045         if (net_eq(dev_net(dev), net))
6046                 goto out;
6047
6048         /* Pick the destination device name, and ensure
6049          * we can use it in the destination network namespace.
6050          */
6051         err = -EEXIST;
6052         if (__dev_get_by_name(net, dev->name)) {
6053                 /* We get here if we can't use the current device name */
6054                 if (!pat)
6055                         goto out;
6056                 if (dev_get_valid_name(dev, pat, 1))
6057                         goto out;
6058         }
6059
6060         /*
6061          * And now a mini version of register_netdevice unregister_netdevice.
6062          */
6063
6064         /* If device is running close it first. */
6065         dev_close(dev);
6066
6067         /* And unlink it from device chain */
6068         err = -ENODEV;
6069         unlist_netdevice(dev);
6070
6071         synchronize_net();
6072
6073         /* Shutdown queueing discipline. */
6074         dev_shutdown(dev);
6075
6076         /* Notify protocols, that we are about to destroy
6077            this device. They should clean all the things.
6078
6079            Note that dev->reg_state stays at NETREG_REGISTERED.
6080            This is wanted because this way 8021q and macvlan know
6081            the device is just moving and can keep their slaves up.
6082         */
6083         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6084         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6085
6086         /*
6087          *      Flush the unicast and multicast chains
6088          */
6089         dev_uc_flush(dev);
6090         dev_mc_flush(dev);
6091
6092         /* Actually switch the network namespace */
6093         dev_net_set(dev, net);
6094
6095         /* If there is an ifindex conflict assign a new one */
6096         if (__dev_get_by_index(net, dev->ifindex)) {
6097                 int iflink = (dev->iflink == dev->ifindex);
6098                 dev->ifindex = dev_new_index(net);
6099                 if (iflink)
6100                         dev->iflink = dev->ifindex;
6101         }
6102
6103         /* Fixup kobjects */
6104         err = device_rename(&dev->dev, dev->name);
6105         WARN_ON(err);
6106
6107         /* Add the device back in the hashes */
6108         list_netdevice(dev);
6109
6110         /* Notify protocols, that a new device appeared. */
6111         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6112
6113         /*
6114          *      Prevent userspace races by waiting until the network
6115          *      device is fully setup before sending notifications.
6116          */
6117         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6118
6119         synchronize_net();
6120         err = 0;
6121 out:
6122         return err;
6123 }
6124 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6125
6126 static int dev_cpu_callback(struct notifier_block *nfb,
6127                             unsigned long action,
6128                             void *ocpu)
6129 {
6130         struct sk_buff **list_skb;
6131         struct sk_buff *skb;
6132         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6133         struct softnet_data *sd, *oldsd;
6134
6135         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6136                 return NOTIFY_OK;
6137
6138         local_irq_disable();
6139         cpu = smp_processor_id();
6140         sd = &per_cpu(softnet_data, cpu);
6141         oldsd = &per_cpu(softnet_data, oldcpu);
6142
6143         /* Find end of our completion_queue. */
6144         list_skb = &sd->completion_queue;
6145         while (*list_skb)
6146                 list_skb = &(*list_skb)->next;
6147         /* Append completion queue from offline CPU. */
6148         *list_skb = oldsd->completion_queue;
6149         oldsd->completion_queue = NULL;
6150
6151         /* Append output queue from offline CPU. */
6152         if (oldsd->output_queue) {
6153                 *sd->output_queue_tailp = oldsd->output_queue;
6154                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6155                 oldsd->output_queue = NULL;
6156                 oldsd->output_queue_tailp = &oldsd->output_queue;
6157         }
6158
6159         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6160         local_irq_enable();
6161
6162         /* Process offline CPU's input_pkt_queue */
6163         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6164                 netif_rx(skb);
6165                 input_queue_head_incr(oldsd);
6166         }
6167         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6168                 netif_rx(skb);
6169                 input_queue_head_incr(oldsd);
6170         }
6171
6172         return NOTIFY_OK;
6173 }
6174
6175
6176 /**
6177  *      netdev_increment_features - increment feature set by one
6178  *      @all: current feature set
6179  *      @one: new feature set
6180  *      @mask: mask feature set
6181  *
6182  *      Computes a new feature set after adding a device with feature set
6183  *      @one to the master device with current feature set @all.  Will not
6184  *      enable anything that is off in @mask. Returns the new feature set.
6185  */
6186 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6187 {
6188         /* If device needs checksumming, downgrade to it. */
6189         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6190                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6191         else if (mask & NETIF_F_ALL_CSUM) {
6192                 /* If one device supports v4/v6 checksumming, set for all. */
6193                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6194                     !(all & NETIF_F_GEN_CSUM)) {
6195                         all &= ~NETIF_F_ALL_CSUM;
6196                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6197                 }
6198
6199                 /* If one device supports hw checksumming, set for all. */
6200                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6201                         all &= ~NETIF_F_ALL_CSUM;
6202                         all |= NETIF_F_HW_CSUM;
6203                 }
6204         }
6205
6206         /* If device can't no cache copy, don't do for all */
6207         if (!(one & NETIF_F_NOCACHE_COPY))
6208                 all &= ~NETIF_F_NOCACHE_COPY;
6209
6210         one |= NETIF_F_ALL_CSUM;
6211
6212         one |= all & NETIF_F_ONE_FOR_ALL;
6213         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6214         all |= one & mask & NETIF_F_ONE_FOR_ALL;
6215
6216         return all;
6217 }
6218 EXPORT_SYMBOL(netdev_increment_features);
6219
6220 static struct hlist_head *netdev_create_hash(void)
6221 {
6222         int i;
6223         struct hlist_head *hash;
6224
6225         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6226         if (hash != NULL)
6227                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6228                         INIT_HLIST_HEAD(&hash[i]);
6229
6230         return hash;
6231 }
6232
6233 /* Initialize per network namespace state */
6234 static int __net_init netdev_init(struct net *net)
6235 {
6236         INIT_LIST_HEAD(&net->dev_base_head);
6237
6238         net->dev_name_head = netdev_create_hash();
6239         if (net->dev_name_head == NULL)
6240                 goto err_name;
6241
6242         net->dev_index_head = netdev_create_hash();
6243         if (net->dev_index_head == NULL)
6244                 goto err_idx;
6245
6246         return 0;
6247
6248 err_idx:
6249         kfree(net->dev_name_head);
6250 err_name:
6251         return -ENOMEM;
6252 }
6253
6254 /**
6255  *      netdev_drivername - network driver for the device
6256  *      @dev: network device
6257  *      @buffer: buffer for resulting name
6258  *      @len: size of buffer
6259  *
6260  *      Determine network driver for device.
6261  */
6262 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6263 {
6264         const struct device_driver *driver;
6265         const struct device *parent;
6266
6267         if (len <= 0 || !buffer)
6268                 return buffer;
6269         buffer[0] = 0;
6270
6271         parent = dev->dev.parent;
6272
6273         if (!parent)
6274                 return buffer;
6275
6276         driver = parent->driver;
6277         if (driver && driver->name)
6278                 strlcpy(buffer, driver->name, len);
6279         return buffer;
6280 }
6281
6282 static int __netdev_printk(const char *level, const struct net_device *dev,
6283                            struct va_format *vaf)
6284 {
6285         int r;
6286
6287         if (dev && dev->dev.parent)
6288                 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6289                                netdev_name(dev), vaf);
6290         else if (dev)
6291                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6292         else
6293                 r = printk("%s(NULL net_device): %pV", level, vaf);
6294
6295         return r;
6296 }
6297
6298 int netdev_printk(const char *level, const struct net_device *dev,
6299                   const char *format, ...)
6300 {
6301         struct va_format vaf;
6302         va_list args;
6303         int r;
6304
6305         va_start(args, format);
6306
6307         vaf.fmt = format;
6308         vaf.va = &args;
6309
6310         r = __netdev_printk(level, dev, &vaf);
6311         va_end(args);
6312
6313         return r;
6314 }
6315 EXPORT_SYMBOL(netdev_printk);
6316
6317 #define define_netdev_printk_level(func, level)                 \
6318 int func(const struct net_device *dev, const char *fmt, ...)    \
6319 {                                                               \
6320         int r;                                                  \
6321         struct va_format vaf;                                   \
6322         va_list args;                                           \
6323                                                                 \
6324         va_start(args, fmt);                                    \
6325                                                                 \
6326         vaf.fmt = fmt;                                          \
6327         vaf.va = &args;                                         \
6328                                                                 \
6329         r = __netdev_printk(level, dev, &vaf);                  \
6330         va_end(args);                                           \
6331                                                                 \
6332         return r;                                               \
6333 }                                                               \
6334 EXPORT_SYMBOL(func);
6335
6336 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6337 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6338 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6339 define_netdev_printk_level(netdev_err, KERN_ERR);
6340 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6341 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6342 define_netdev_printk_level(netdev_info, KERN_INFO);
6343
6344 static void __net_exit netdev_exit(struct net *net)
6345 {
6346         kfree(net->dev_name_head);
6347         kfree(net->dev_index_head);
6348 }
6349
6350 static struct pernet_operations __net_initdata netdev_net_ops = {
6351         .init = netdev_init,
6352         .exit = netdev_exit,
6353 };
6354
6355 static void __net_exit default_device_exit(struct net *net)
6356 {
6357         struct net_device *dev, *aux;
6358         /*
6359          * Push all migratable network devices back to the
6360          * initial network namespace
6361          */
6362         rtnl_lock();
6363         for_each_netdev_safe(net, dev, aux) {
6364                 int err;
6365                 char fb_name[IFNAMSIZ];
6366
6367                 /* Ignore unmoveable devices (i.e. loopback) */
6368                 if (dev->features & NETIF_F_NETNS_LOCAL)
6369                         continue;
6370
6371                 /* Leave virtual devices for the generic cleanup */
6372                 if (dev->rtnl_link_ops)
6373                         continue;
6374
6375                 /* Push remaining network devices to init_net */
6376                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6377                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6378                 if (err) {
6379                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6380                                 __func__, dev->name, err);
6381                         BUG();
6382                 }
6383         }
6384         rtnl_unlock();
6385 }
6386
6387 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6388 {
6389         /* At exit all network devices most be removed from a network
6390          * namespace.  Do this in the reverse order of registration.
6391          * Do this across as many network namespaces as possible to
6392          * improve batching efficiency.
6393          */
6394         struct net_device *dev;
6395         struct net *net;
6396         LIST_HEAD(dev_kill_list);
6397
6398         rtnl_lock();
6399         list_for_each_entry(net, net_list, exit_list) {
6400                 for_each_netdev_reverse(net, dev) {
6401                         if (dev->rtnl_link_ops)
6402                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6403                         else
6404                                 unregister_netdevice_queue(dev, &dev_kill_list);
6405                 }
6406         }
6407         unregister_netdevice_many(&dev_kill_list);
6408         list_del(&dev_kill_list);
6409         rtnl_unlock();
6410 }
6411
6412 static struct pernet_operations __net_initdata default_device_ops = {
6413         .exit = default_device_exit,
6414         .exit_batch = default_device_exit_batch,
6415 };
6416
6417 /*
6418  *      Initialize the DEV module. At boot time this walks the device list and
6419  *      unhooks any devices that fail to initialise (normally hardware not
6420  *      present) and leaves us with a valid list of present and active devices.
6421  *
6422  */
6423
6424 /*
6425  *       This is called single threaded during boot, so no need
6426  *       to take the rtnl semaphore.
6427  */
6428 static int __init net_dev_init(void)
6429 {
6430         int i, rc = -ENOMEM;
6431
6432         BUG_ON(!dev_boot_phase);
6433
6434         if (dev_proc_init())
6435                 goto out;
6436
6437         if (netdev_kobject_init())
6438                 goto out;
6439
6440         INIT_LIST_HEAD(&ptype_all);
6441         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6442                 INIT_LIST_HEAD(&ptype_base[i]);
6443
6444         if (register_pernet_subsys(&netdev_net_ops))
6445                 goto out;
6446
6447         /*
6448          *      Initialise the packet receive queues.
6449          */
6450
6451         for_each_possible_cpu(i) {
6452                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6453
6454                 memset(sd, 0, sizeof(*sd));
6455                 skb_queue_head_init(&sd->input_pkt_queue);
6456                 skb_queue_head_init(&sd->process_queue);
6457                 sd->completion_queue = NULL;
6458                 INIT_LIST_HEAD(&sd->poll_list);
6459                 sd->output_queue = NULL;
6460                 sd->output_queue_tailp = &sd->output_queue;
6461 #ifdef CONFIG_RPS
6462                 sd->csd.func = rps_trigger_softirq;
6463                 sd->csd.info = sd;
6464                 sd->csd.flags = 0;
6465                 sd->cpu = i;
6466 #endif
6467
6468                 sd->backlog.poll = process_backlog;
6469                 sd->backlog.weight = weight_p;
6470                 sd->backlog.gro_list = NULL;
6471                 sd->backlog.gro_count = 0;
6472         }
6473
6474         dev_boot_phase = 0;
6475
6476         /* The loopback device is special if any other network devices
6477          * is present in a network namespace the loopback device must
6478          * be present. Since we now dynamically allocate and free the
6479          * loopback device ensure this invariant is maintained by
6480          * keeping the loopback device as the first device on the
6481          * list of network devices.  Ensuring the loopback devices
6482          * is the first device that appears and the last network device
6483          * that disappears.
6484          */
6485         if (register_pernet_device(&loopback_net_ops))
6486                 goto out;
6487
6488         if (register_pernet_device(&default_device_ops))
6489                 goto out;
6490
6491         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6492         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6493
6494         hotcpu_notifier(dev_cpu_callback, 0);
6495         dst_init();
6496         dev_mcast_init();
6497         rc = 0;
6498 out:
6499         return rc;
6500 }
6501
6502 subsys_initcall(net_dev_init);
6503
6504 static int __init initialize_hashrnd(void)
6505 {
6506         get_random_bytes(&hashrnd, sizeof(hashrnd));
6507         return 0;
6508 }
6509
6510 late_initcall_sync(initialize_hashrnd);
6511